From 1f4d7c25d36ff14c06149825d0de583e7af66e28 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 20 May 2021 14:07:01 -0400 Subject: [PATCH 01/85] add custom_hyperparameters --- .../automl_algorithm/iterative_algorithm.py | 9 +++++++- evalml/automl/automl_search.py | 22 +++++++++++++++++-- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 8063ffd1b1..42116eb6f4 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -124,7 +124,12 @@ def next_batch(self): def _combine_parameters(self, pipeline, proposed_parameters): """Helper function for logic to transform proposed parameters and frozen parameters.""" - return {**self._transform_parameters(pipeline, proposed_parameters), **self._frozen_pipeline_parameters} + print(f'Iterative Algorithm - _combine_parameters - pipeline: {pipeline}') + print(f'Iterative Algorithm - _combine_parameters - proposed_parameters: {proposed_parameters}') + print(f'Iterative Algorithm - _combine_parameters - self._frozen_pipeline_parameters: {self._frozen_pipeline_parameters}') + _returning = {**self._transform_parameters(pipeline, proposed_parameters), **self._frozen_pipeline_parameters} + print(f'Iterative Algorithm - _combine_parameters - returning combined parameters: {_returning}') + return _returning def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): """Register results from evaluating a pipeline @@ -186,4 +191,6 @@ def _transform_parameters(self, pipeline, proposed_parameters): if param_name in init_params: component_parameters[param_name] = value parameters[name] = component_parameters + print(f'Iterative Algorithm - _transform_parameters - component_parameters: {component_parameters}') + print(f'Iterative Algorithm - _transform_parameters - final transformed parameters: {parameters}') return parameters diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index e936cbc33f..f4db4c0473 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -142,6 +142,7 @@ def __init__(self, problem_configuration=None, train_best_pipeline=True, pipeline_parameters=None, + custom_hyperparameters=None, sampler_method="auto", sampler_balanced_ratio=0.25, _ensembling_split_size=0.2, @@ -222,7 +223,21 @@ def __init__(self, train_best_pipeline (boolean): Whether or not to train the best pipeline before returning it. Defaults to True. - pipeline_parameters (dict): A dict of the parameters used to initalize a pipeline with. + pipeline_parameters (dict): A dict of the parameters used to initialize a pipeline with. + Keys should consist of the component names and values should specify parameter values + e.g. pipeline_parameters = { + 'Imputer' : { + 'numeric_impute_strategy': 'most_frequent' + } + } + + custom_hyperparameters (dict): A dict of the hyperparameter ranges used to iterate over during search. + Keys should consist of the component names and values should specify lists or skopt.Space with length greater than 1 + e.g. custom_hyperparameters = { + 'Imputer' : { + 'numeric_impute_strategy': Categorical(['most_frequent', 'median']) + } + } sampler_method (str): The data sampling component to use in the pipelines if the problem type is classification and the target balance is smaller than the sampler_balanced_ratio. 
Either 'auto', which will use our preferred sampler for the data, 'Undersampler', 'Oversampler', or None. Defaults to 'auto'. @@ -342,6 +357,7 @@ def __init__(self, n_splits=3, shuffle=True, random_seed=self.random_seed) self.data_splitter = self.data_splitter or default_data_splitter self.pipeline_parameters = pipeline_parameters if pipeline_parameters is not None else {} + self.custom_hyperparameters = custom_hyperparameters if custom_hyperparameters is not None else {} self.search_iteration_plot = None self._interrupted = False self._frozen_pipeline_parameters = {} @@ -359,6 +375,7 @@ def __init__(self, if self.sampler_method in ['auto', 'Oversampler']: self._sampler_name = get_best_sampler_for_data(self.X_train, self.y_train, self.sampler_method, self.sampler_balanced_ratio) self._frozen_pipeline_parameters[self._sampler_name] = {"sampling_ratio": self.sampler_balanced_ratio} + parameters[self._sampler_name] = {"sampling_ratio": self.sampler_balanced_ratio} if self.allowed_pipelines is None: logger.info("Generating pipelines to search over...") @@ -368,7 +385,8 @@ def __init__(self, index_columns = list(self.X_train.select('index').columns) if len(index_columns) > 0 and drop_columns is None: self._frozen_pipeline_parameters['Drop Columns Transformer'] = {'columns': index_columns} - self.allowed_pipelines = [make_pipeline(self.X_train, self.y_train, estimator, self.problem_type, parameters=self._frozen_pipeline_parameters, custom_hyperparameters=parameters, sampler_name=self._sampler_name) for estimator in allowed_estimators] + parameters[self._sampler_name] = {"sampling_ratio": self.sampler_balanced_ratio} + self.allowed_pipelines = [make_pipeline(self.X_train, self.y_train, estimator, self.problem_type, parameters=self._frozen_pipeline_parameters, custom_hyperparameters=self.custom_hyperparameters, sampler_name=self._sampler_name) for estimator in allowed_estimators] else: for pipeline in self.allowed_pipelines: if self.pipeline_parameters: From 33df64603696fa3be2fdaa0b86d0eb939aee54ed Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 20 May 2021 14:48:54 -0400 Subject: [PATCH 02/85] comment --- evalml/tuners/skopt_tuner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evalml/tuners/skopt_tuner.py b/evalml/tuners/skopt_tuner.py index 3b5bd46e1d..c3ae7d537a 100644 --- a/evalml/tuners/skopt_tuner.py +++ b/evalml/tuners/skopt_tuner.py @@ -34,10 +34,12 @@ def add(self, pipeline_parameters, score): Returns: None """ + print(f'SKOptTuner - add - pipeline parameters: {pipeline_parameters}') # skip adding nan scores if pd.isnull(score): return flat_parameter_values = self._convert_to_flat_parameters(pipeline_parameters) + print(f'SKOptTuner - add - flat_parameter_values: {flat_parameter_values}') try: self.opt.tell(flat_parameter_values, score) except Exception as e: From cc2bb32539ac0618cb971d09745a6f0f28be9626 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 21 May 2021 11:24:19 -0400 Subject: [PATCH 03/85] print --- evalml/tuners/tuner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/evalml/tuners/tuner.py b/evalml/tuners/tuner.py index 21b61e932f..08e35abde9 100644 --- a/evalml/tuners/tuner.py +++ b/evalml/tuners/tuner.py @@ -44,6 +44,7 @@ def _convert_to_flat_parameters(self, pipeline_parameters): component_name, parameter_name = self._parameter_names_map[flat_parameter_name] if component_name not in pipeline_parameters or parameter_name not in pipeline_parameters[component_name]: raise TypeError('Pipeline parameters missing required field "{}" for component 
"{}"'.format(parameter_name, component_name)) + print(f"tuner - _convert_to_flat_parameters - adding value to flat parameters: {component_name} - {parameter_name} - {pipeline_parameters[component_name][parameter_name]}") flat_parameter_values.append(pipeline_parameters[component_name][parameter_name]) return flat_parameter_values From 0ea03dea696f3815acce57fa50ab67aa49c25612 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 21 May 2021 11:37:41 -0400 Subject: [PATCH 04/85] print --- evalml/automl/automl_search.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index f4db4c0473..42311d5906 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -399,6 +399,11 @@ def __init__(self, if self.allowed_pipelines == []: raise ValueError("No allowed pipelines to search") + from pprint import pprint as pp + for pipe_ in self.allowed_pipelines: + pp(f"automl_search - init - pipelines: {pipe_}") + pp(f"automl_search - init - pipelines parameters: {pipe_.parameters}") + logger.info(f"{len(self.allowed_pipelines)} pipelines ready for search.") check_all_pipeline_names_unique(self.allowed_pipelines) From d3323e0f127d5b3fde975ee8517e68ae08b6a4f5 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 21 May 2021 11:43:15 -0400 Subject: [PATCH 05/85] print --- evalml/pipelines/pipeline_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index 89a61a916b..1d73db711e 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -321,6 +321,7 @@ def hyperparameters(self): if self.custom_hyperparameters and component_name in self.custom_hyperparameters: component_hyperparameters.update(self.custom_hyperparameters.get(component_name, {})) hyperparameter_ranges[component_name] = component_hyperparameters + print(f"pipeline_base - hyperparameters - hyperparameter_ranges: {hyperparameter_ranges}") return hyperparameter_ranges @property From a45a18726cdc8fc0002e37f015c39be02b6aaf53 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 21 May 2021 17:07:57 -0400 Subject: [PATCH 06/85] print statements --- evalml/automl/automl_algorithm/automl_algorithm.py | 3 +++ .../automl/automl_algorithm/iterative_algorithm.py | 13 ++++++++++--- evalml/automl/automl_search.py | 14 +++++++++++++- evalml/automl/engine/engine_base.py | 6 ++++++ evalml/automl/engine/sequential_engine.py | 1 + 5 files changed, 33 insertions(+), 4 deletions(-) diff --git a/evalml/automl/automl_algorithm/automl_algorithm.py b/evalml/automl/automl_algorithm/automl_algorithm.py index 0622289b32..fe195bd46c 100644 --- a/evalml/automl/automl_algorithm/automl_algorithm.py +++ b/evalml/automl/automl_algorithm/automl_algorithm.py @@ -55,6 +55,9 @@ def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): """ if pipeline.name not in self._tuners: raise PipelineNotFoundError(f"No such pipeline allowed in this AutoML search: {pipeline.name}") + print(f"automlalgorithm - add_result - pipeline name: {pipeline.name}") + print(f"automlalgorithm - add_result - pipeline: {pipeline}") + print(f"automlalgorithm - add_result - pipeline parameters: {pipeline.parameters}") self._tuners[pipeline.name].add(pipeline.parameters, score_to_minimize) @property diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 42116eb6f4..0ea5b3ba61 100644 --- 
a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -91,6 +91,8 @@ def next_batch(self): next_batch = [] if self._batch_number == 0: + for pipeline in self.allowed_pipelines: + print(f"iterative_algorithm - next_batch - new pipeline parameters for batch 0: {self._combine_parameters(pipeline, {})}") next_batch = [pipeline.new(parameters=self._combine_parameters(pipeline, {}), random_seed=self.random_seed) for pipeline in self.allowed_pipelines] @@ -114,9 +116,11 @@ def next_batch(self): num_pipelines = (len(self._first_batch_results) + 1) if self.ensembling else len(self._first_batch_results) idx = (self._batch_number - 1) % num_pipelines pipeline = self._first_batch_results[idx][1] + print(f"iterative_algorithm - next_batch - pipeline: {pipeline.parameters}") for i in range(self.pipelines_per_batch): proposed_parameters = self._tuners[pipeline.name].propose() parameters = self._combine_parameters(pipeline, proposed_parameters) + print(f"iterative_algorithm - next_batch - new pipeline parameters: {parameters}") next_batch.append(pipeline.new(parameters=parameters, random_seed=self.random_seed)) self._pipeline_number += len(next_batch) self._batch_number += 1 @@ -162,19 +166,21 @@ def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): def _transform_parameters(self, pipeline, proposed_parameters): """Given a pipeline parameters dict, make sure n_jobs and number_features are set.""" + print('-------------------------') parameters = {} if 'pipeline' in self._pipeline_params: parameters['pipeline'] = self._pipeline_params['pipeline'] for name, component_class in pipeline.linearized_component_graph: component_parameters = proposed_parameters.get(name, {}) init_params = inspect.signature(component_class.__init__).parameters - + print(f"iterativealgorithm - _transform_parameters - init_params: {init_params}") # Inspects each component and adds the following parameters when needed if 'n_jobs' in init_params: component_parameters['n_jobs'] = self.n_jobs if 'number_features' in init_params: component_parameters['number_features'] = self.number_features # For first batch, pass the pipeline params to the components that need them + print(f"iterativealgorithm - _transform_parameters - self._pipeline_params: {self._pipeline_params}") if name in self._pipeline_params and self._batch_number == 0: for param_name, value in self._pipeline_params[name].items(): if isinstance(value, (Integer, Real)): @@ -190,7 +196,8 @@ def _transform_parameters(self, pipeline, proposed_parameters): for param_name, value in self._pipeline_params['pipeline'].items(): if param_name in init_params: component_parameters[param_name] = value + print(f"iterativealgorithm - _transform_parameters - component_parameters: {component_parameters}") parameters[name] = component_parameters - print(f'Iterative Algorithm - _transform_parameters - component_parameters: {component_parameters}') - print(f'Iterative Algorithm - _transform_parameters - final transformed parameters: {parameters}') + print(f"iterativealgorithm - _transform_parameters - parameters: {parameters}") + print('-------------------------') return parameters diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 42311d5906..1483e44465 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -452,6 +452,9 @@ def __init__(self, logger.debug(f"allowed_pipelines set to {[pipeline.name for pipeline in self.allowed_pipelines]}") 
logger.debug(f"allowed_model_families set to {self.allowed_model_families}") + for pipe_ in self.allowed_pipelines: + print(f"automlsearch - init - pre iterative algorithm pipeline parameters: {pipe_.parameters}") + self._automl_algorithm = IterativeAlgorithm( max_iterations=self.max_iterations, allowed_pipelines=self.allowed_pipelines, @@ -614,7 +617,9 @@ def search(self, show_iteration_plot=True): log_title(logger, f"Evaluating Batch Number {self._get_batch_number()}") for pipeline in current_batch_pipelines: self._pre_evaluation_callback(pipeline) + print(f"automlsearch - search - pipeline pre submit evaluation: {pipeline.parameters}") computation = self._engine.submit_evaluation_job(self.automl_config, pipeline, self.X_train, self.y_train) + print(f"automlsearch - search - computation: {computation}") computations.append(computation) current_computation_index = 0 while self._should_continue() and len(computations) > 0: @@ -666,6 +671,7 @@ def _find_best_pipeline(self): if self._train_best_pipeline: X_train = self.X_train y_train = self.y_train + print(f"automlsearch - _add_baseline_pipelines - best_pipeline: {best_pipeline}") best_pipeline = self._engine.submit_training_job(self.automl_config, best_pipeline, X_train, y_train).get_result() self._best_pipeline = best_pipeline @@ -761,6 +767,7 @@ def _add_baseline_pipelines(self): baseline = self._get_baseline_pipeline() self._pre_evaluation_callback(baseline) logger.info(f"Evaluating Baseline Pipeline: {baseline.name}") + print(f"automlsearch - _add_baseline_pipelines - baseline: {baseline}") computation = self._engine.submit_evaluation_job(self.automl_config, baseline, self.X_train, self.y_train) evaluation = computation.get_result() data, pipeline, job_log = evaluation.get('scores'), evaluation.get("pipeline"), evaluation.get("logger") @@ -830,6 +837,7 @@ def _post_evaluation_callback(self, pipeline, evaluation_results, job_log): if not is_baseline: score_to_minimize = -cv_score if self.objective.greater_is_better else cv_score try: + print(f"automlsearch - _post_evaluation_callback - pipeline: {pipeline}") self._automl_algorithm.add_result(score_to_minimize, pipeline, self._results['pipeline_results'][pipeline_id]) except PipelineNotFoundError: pass @@ -932,13 +940,15 @@ def add_to_rankings(self, pipeline): pipeline (PipelineBase): pipeline to train and evaluate. 
""" pipeline_rows = self.full_rankings[self.full_rankings['pipeline_name'] == pipeline.name] + print(f"automlsearch - add_to_rankings - pipeline_rows: {pipeline_rows}") for parameter in pipeline_rows['parameters']: if pipeline.parameters == parameter: return - + print(f"automlsearch - add_to_rankings - pipeline: {pipeline}") computation = self._engine.submit_evaluation_job(self.automl_config, pipeline, self.X_train, self.y_train) evaluation = computation.get_result() data, pipeline, job_log = evaluation.get('scores'), evaluation.get("pipeline"), evaluation.get("logger") + print(f"automlsearch - add_to_rankings - pipeline: {pipeline}") self._post_evaluation_callback(pipeline, data, job_log) self._find_best_pipeline() @@ -1032,6 +1042,7 @@ def train_pipelines(self, pipelines): y_train = self.y_train for pipeline in pipelines: + print(f"automlsearch - train_pipelines - pipeline: {pipeline}") computations.append(self._engine.submit_training_job(self.automl_config, pipeline, X_train, y_train)) while computations: @@ -1070,6 +1081,7 @@ def score_pipelines(self, pipelines, X_holdout, y_holdout, objectives): computations = [] for pipeline in pipelines: + print(f"automlsearch - score_pipelines - pipeline: {pipeline}") computations.append(self._engine.submit_scoring_job(self.automl_config, pipeline, X_holdout, y_holdout, objectives)) while computations: diff --git a/evalml/automl/engine/engine_base.py b/evalml/automl/engine/engine_base.py index 5f9de711ef..ce427a3101 100644 --- a/evalml/automl/engine/engine_base.py +++ b/evalml/automl/engine/engine_base.py @@ -104,6 +104,9 @@ def train_pipeline(pipeline, X, y, optimize_thresholds, objective): Returns: pipeline (PipelineBase): trained pipeline. """ + print(f"engine_base - train_pipeline - pipeline: {pipeline}") + print(f"engine_base - train_pipeline - pipeline parameters: {pipeline.parameters}") + print(f"engine_base - train_pipeline - pipeline parameters: {pipeline.hyperparameters}") X_threshold_tuning = None y_threshold_tuning = None if optimize_thresholds and pipeline.can_tune_threshold_with_objective(objective): @@ -129,6 +132,9 @@ def train_and_score_pipeline(pipeline, automl_config, full_X_train, full_y_train tuple of three items: First - A dict containing cv_score_mean, cv_scores, training_time and a cv_data structure with details. Second - The pipeline class we trained and scored. Third - the job logger instance with all the recorded messages. 
""" + print(f"engine_base - train_and_score_pipeline - pipeline: {pipeline}") + print(f"engine_base - train_and_score_pipeline - pipeline parameters: {pipeline.parameters}") + print(f"engine_base - train_and_score_pipeline - pipeline parameters: {pipeline.hyperparameters}") start = time.time() cv_data = [] logger.info("\tStarting cross validation") diff --git a/evalml/automl/engine/sequential_engine.py b/evalml/automl/engine/sequential_engine.py index 0912843ac5..349ea8506e 100644 --- a/evalml/automl/engine/sequential_engine.py +++ b/evalml/automl/engine/sequential_engine.py @@ -43,6 +43,7 @@ class SequentialEngine(EngineBase): def submit_evaluation_job(self, automl_config, pipeline, X, y): logger = self.setup_job_log() + print(f"sequentialengine - submit_evaluation_job - pipeline: {pipeline}") return SequentialComputation(work=evaluate_pipeline, pipeline=pipeline, automl_config=automl_config, X=X, From 8b31f5c8d1fc402f31bfdc91ec07b21ce82ca451 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Sat, 22 May 2021 09:14:50 -0400 Subject: [PATCH 07/85] remove custom hyperparameters from pipelines --- .../automl_algorithm/iterative_algorithm.py | 16 ++++++++++ evalml/automl/automl_search.py | 12 ++----- evalml/pipelines/pipeline_base.py | 32 ++----------------- 3 files changed, 21 insertions(+), 39 deletions(-) diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 0ea5b3ba61..58ed13d442 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -34,6 +34,7 @@ def __init__(self, number_features=None, # TODO remove ensembling=False, pipeline_params=None, + custom_hyperparameters=None, _frozen_pipeline_parameters=None, _estimator_family_order=None): """An automl algorithm which first fits a base round of pipelines with default parameters, then does a round of parameter tuning on each pipeline in order of performance. @@ -48,6 +49,7 @@ def __init__(self, number_features (int): The number of columns in the input features. ensembling (boolean): If True, runs ensembling in a separate batch after every allowed pipeline class has been iterated over. Defaults to False. pipeline_params (dict or None): Pipeline-level parameters that should be passed to the proposed pipelines. + custom_hyperparameters (dict or None): Custom hyperparameter ranges specified for pipelines to iterate over. _frozen_pipeline_parameters (dict or None): Pipeline-level parameters are frozen and used in the proposed pipelines. _estimator_family_order (list(ModelFamily) or None): specify the sort order for the first batch. Defaults to _ESTIMATOR_FAMILY_ORDER. 
""" @@ -76,6 +78,7 @@ def __init__(self, self._best_pipeline_info = {} self.ensembling = ensembling and len(self.allowed_pipelines) > 1 self._pipeline_params = pipeline_params or {} + self._custom_hyperparameters = custom_hyperparameters or {} self._frozen_pipeline_parameters = _frozen_pipeline_parameters or {} def next_batch(self): @@ -171,6 +174,10 @@ def _transform_parameters(self, pipeline, proposed_parameters): if 'pipeline' in self._pipeline_params: parameters['pipeline'] = self._pipeline_params['pipeline'] for name, component_class in pipeline.linearized_component_graph: + print(f"iterativealgorithm - _transform_parameters - pipeline: {pipeline}") + print(f"iterativealgorithm - _transform_parameters - pipeline.linearized_component_graph: {pipeline.linearized_component_graph}") + print(f"iterativealgorithm - _transform_parameters - pipeline parameters: {pipeline.parameters}") + print(f"iterativealgorithm - _transform_parameters - pipeline hyperparameters: {pipeline.custom_hyperparameters}") component_parameters = proposed_parameters.get(name, {}) init_params = inspect.signature(component_class.__init__).parameters print(f"iterativealgorithm - _transform_parameters - init_params: {init_params}") @@ -181,6 +188,15 @@ def _transform_parameters(self, pipeline, proposed_parameters): component_parameters['number_features'] = self.number_features # For first batch, pass the pipeline params to the components that need them print(f"iterativealgorithm - _transform_parameters - self._pipeline_params: {self._pipeline_params}") + if name in self._pipeline_params and self._batch_number == 0: + for param_name, value in self._pipeline_params[name].items(): + if isinstance(value, (Integer, Real)): + # get a random value in the space + component_parameters[param_name] = value.rvs(random_state=self.random_seed)[0] + elif isinstance(value, Categorical): + component_parameters[param_name] = value.rvs(random_state=self.random_seed) + else: + component_parameters[param_name] = value if name in self._pipeline_params and self._batch_number == 0: for param_name, value in self._pipeline_params[name].items(): if isinstance(value, (Integer, Real)): diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 1483e44465..7ab878e5a0 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -363,6 +363,7 @@ def __init__(self, self._frozen_pipeline_parameters = {} parameters = copy.copy(self.pipeline_parameters) + custom_hyperparameters = copy.copy(self.custom_hyperparameters) if self.problem_configuration: parameters.update({'pipeline': self.problem_configuration}) self._frozen_pipeline_parameters.update({'pipeline': self.problem_configuration}) @@ -386,15 +387,7 @@ def __init__(self, if len(index_columns) > 0 and drop_columns is None: self._frozen_pipeline_parameters['Drop Columns Transformer'] = {'columns': index_columns} parameters[self._sampler_name] = {"sampling_ratio": self.sampler_balanced_ratio} - self.allowed_pipelines = [make_pipeline(self.X_train, self.y_train, estimator, self.problem_type, parameters=self._frozen_pipeline_parameters, custom_hyperparameters=self.custom_hyperparameters, sampler_name=self._sampler_name) for estimator in allowed_estimators] - else: - for pipeline in self.allowed_pipelines: - if self.pipeline_parameters: - if pipeline.custom_hyperparameters: - for component_name, params in self.pipeline_parameters.items(): - pipeline.custom_hyperparameters[component_name] = params - else: - pipeline.custom_hyperparameters = 
self.pipeline_parameters + self.allowed_pipelines = [make_pipeline(self.X_train, self.y_train, estimator, self.problem_type, parameters=self._frozen_pipeline_parameters, sampler_name=self._sampler_name) for estimator in allowed_estimators] if self.allowed_pipelines == []: raise ValueError("No allowed pipelines to search") @@ -465,6 +458,7 @@ def __init__(self, pipelines_per_batch=self._pipelines_per_batch, ensembling=run_ensembling, pipeline_params=parameters, + custom_hyperparameters=custom_hyperparameters, _frozen_pipeline_parameters=self._frozen_pipeline_parameters ) diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index 1d73db711e..4995cc1cbc 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -46,7 +46,6 @@ def __init__(self, component_graph, parameters=None, custom_name=None, - custom_hyperparameters=None, random_seed=0): """Machine learning pipeline made out of transformers and a estimator. @@ -59,10 +58,8 @@ def __init__(self, parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values. An empty dictionary or None implies using all default values for component parameters. Defaults to None. custom_name (str): Custom name for the pipeline. Defaults to None. - custom_hyperparameters (dict): Custom hyperparameter range for the pipeline. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. """ - self._custom_hyperparameters = custom_hyperparameters self.random_seed = random_seed self.component_graph = component_graph @@ -90,16 +87,6 @@ def __init__(self, self._custom_name = custom_name - @property - def custom_hyperparameters(self): - """Custom hyperparameters for the pipeline.""" - return self._custom_hyperparameters - - @custom_hyperparameters.setter - def custom_hyperparameters(self, value): - """Custom hyperparameters for the pipeline.""" - self._custom_hyperparameters = value - @property def custom_name(self): """Custom name of the pipeline.""" @@ -312,18 +299,6 @@ def model_family(self): final_component = order[-1] return handle_component_class(component_graph[final_component].__class__).model_family - @property - def hyperparameters(self): - """Returns hyperparameter ranges from all components as a dictionary""" - hyperparameter_ranges = dict() - for component_name, component_class in self.linearized_component_graph: - component_hyperparameters = copy.copy(component_class.hyperparameter_ranges) - if self.custom_hyperparameters and component_name in self.custom_hyperparameters: - component_hyperparameters.update(self.custom_hyperparameters.get(component_name, {})) - hyperparameter_ranges[component_name] = component_hyperparameters - print(f"pipeline_base - hyperparameters - hyperparameter_ranges: {hyperparameter_ranges}") - return hyperparameter_ranges - @property def parameters(self): """Parameter dictionary for this pipeline @@ -489,7 +464,7 @@ def clone(self): Returns: A new instance of this pipeline with identical components, parameters, and random state. 
""" - return self.__class__(self.component_graph, parameters=self.parameters, custom_name=self.custom_name, custom_hyperparameters=self.custom_hyperparameters, random_seed=self.random_seed) + return self.__class__(self.component_graph, parameters=self.parameters, custom_name=self.custom_name, random_seed=self.random_seed) def new(self, parameters, random_seed=0): """Constructs a new instance of the pipeline with the same component graph but with a different set of parameters. @@ -502,7 +477,7 @@ def new(self, parameters, random_seed=0): Returns: A new instance of this pipeline with identical components. """ - return self.__class__(self.component_graph, parameters=parameters, custom_name=self.custom_name, custom_hyperparameters=self.custom_hyperparameters, random_seed=random_seed) + return self.__class__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) def __eq__(self, other): if not isinstance(other, self.__class__): @@ -526,9 +501,6 @@ def repr_component(parameters): component_graph_repr = ", ".join([f"'{component}'" if isinstance(component, str) else component.__name__ for component in self.component_graph]) component_graph_str = f"[{component_graph_repr}]" - custom_hyperparameters_repr = ', '.join([f"'{component}':{{{repr_component(hyperparameters)}}}" for component, hyperparameters in self.custom_hyperparameters.items()]) if self.custom_hyperparameters else None - custom_hyperparmeter_str = f"custom_hyperparameters={{{custom_hyperparameters_repr}}}" if custom_hyperparameters_repr else None - parameters_repr = ', '.join([f"'{component}':{{{repr_component(parameters)}}}" for component, parameters in self.parameters.items()]) parameters_str = f"parameters={{{parameters_repr}}}" From f2a2917879987c2b8b6c26452140b6cf6a2033f3 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Sun, 23 May 2021 08:40:13 -0400 Subject: [PATCH 08/85] more changes --- .../automl/automl_algorithm/automl_algorithm.py | 11 ++++++++++- .../automl_algorithm/iterative_algorithm.py | 16 ++++++++-------- evalml/automl/engine/engine_base.py | 2 -- evalml/pipelines/classification_pipeline.py | 3 +-- evalml/pipelines/pipeline_base.py | 2 +- .../time_series_classification_pipelines.py | 3 +-- .../pipelines/time_series_regression_pipeline.py | 3 +-- evalml/pipelines/utils.py | 9 ++------- 8 files changed, 24 insertions(+), 25 deletions(-) diff --git a/evalml/automl/automl_algorithm/automl_algorithm.py b/evalml/automl/automl_algorithm/automl_algorithm.py index fe195bd46c..2d94561eeb 100644 --- a/evalml/automl/automl_algorithm/automl_algorithm.py +++ b/evalml/automl/automl_algorithm/automl_algorithm.py @@ -14,6 +14,7 @@ class AutoMLAlgorithm(ABC): def __init__(self, allowed_pipelines=None, + custom_hyperparameters=None, max_iterations=None, tuner_class=None, random_seed=0): @@ -33,7 +34,15 @@ def __init__(self, self._tuner_class = tuner_class or SKOptTuner self._tuners = {} for pipeline in self.allowed_pipelines: - self._tuners[pipeline.name] = self._tuner_class(pipeline.hyperparameters, random_seed=self.random_seed) + print(f"AutoMLAlgorithm - init - pipeline: {pipeline}") + pipeline_hyperparameters = dict() + for comp_name in custom_hyperparameters.keys(): + print(f"AutoMLAlgorithm - init - comp_name: {comp_name}") + if comp_name in pipeline.parameters.keys(): + print(f"AutoMLAlgorithm - init - hyperparameter key in pipeline keys: {comp_name}") + pipeline_hyperparameters[comp_name] = custom_hyperparameters[comp_name] + print(f"AutoMLAlgorithm - init - 
pipeline_hyperparameters: {pipeline_hyperparameters}") + self._tuners[pipeline.name] = self._tuner_class(pipeline_hyperparameters, random_seed=self.random_seed) self._pipeline_number = 0 self._batch_number = 0 diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 58ed13d442..35bd9ed179 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -68,6 +68,7 @@ def __init__(self, allowed_pipelines = pipelines_start + pipelines_end super().__init__(allowed_pipelines=allowed_pipelines, + custom_hyperparameters=custom_hyperparameters, max_iterations=max_iterations, tuner_class=tuner_class, random_seed=random_seed) @@ -177,7 +178,6 @@ def _transform_parameters(self, pipeline, proposed_parameters): print(f"iterativealgorithm - _transform_parameters - pipeline: {pipeline}") print(f"iterativealgorithm - _transform_parameters - pipeline.linearized_component_graph: {pipeline.linearized_component_graph}") print(f"iterativealgorithm - _transform_parameters - pipeline parameters: {pipeline.parameters}") - print(f"iterativealgorithm - _transform_parameters - pipeline hyperparameters: {pipeline.custom_hyperparameters}") component_parameters = proposed_parameters.get(name, {}) init_params = inspect.signature(component_class.__init__).parameters print(f"iterativealgorithm - _transform_parameters - init_params: {init_params}") @@ -188,8 +188,10 @@ def _transform_parameters(self, pipeline, proposed_parameters): component_parameters['number_features'] = self.number_features # For first batch, pass the pipeline params to the components that need them print(f"iterativealgorithm - _transform_parameters - self._pipeline_params: {self._pipeline_params}") - if name in self._pipeline_params and self._batch_number == 0: - for param_name, value in self._pipeline_params[name].items(): + if name in self._custom_hyperparameters and self._batch_number == 0: + print(f"iterativealgorithm - _transform_parameters - hyperparameter name batch 0: {name}") + for param_name, value in self._custom_hyperparameters[name].items(): + print(f"iterativealgorithm - _transform_parameters - param_name name/value: {param_name} - {value}") if isinstance(value, (Integer, Real)): # get a random value in the space component_parameters[param_name] = value.rvs(random_state=self.random_seed)[0] @@ -199,11 +201,9 @@ def _transform_parameters(self, pipeline, proposed_parameters): component_parameters[param_name] = value if name in self._pipeline_params and self._batch_number == 0: for param_name, value in self._pipeline_params[name].items(): - if isinstance(value, (Integer, Real)): - # get a random value in the space - component_parameters[param_name] = value.rvs(random_state=self.random_seed)[0] - elif isinstance(value, Categorical): - component_parameters[param_name] = value.rvs(random_state=self.random_seed) + if isinstance(value, (Integer, Real, Categorical)): + raise ValueError("Pipeline parameters should not contain skopt.Space variables, please pass them " + "to custom_hyperparameters instead!") else: component_parameters[param_name] = value if name in self._pipeline_params and name == 'Drop Columns Transformer' and self._batch_number > 0: diff --git a/evalml/automl/engine/engine_base.py b/evalml/automl/engine/engine_base.py index ce427a3101..a65d0716f4 100644 --- a/evalml/automl/engine/engine_base.py +++ b/evalml/automl/engine/engine_base.py @@ -106,7 +106,6 @@ def train_pipeline(pipeline, X, y, 
optimize_thresholds, objective): """ print(f"engine_base - train_pipeline - pipeline: {pipeline}") print(f"engine_base - train_pipeline - pipeline parameters: {pipeline.parameters}") - print(f"engine_base - train_pipeline - pipeline parameters: {pipeline.hyperparameters}") X_threshold_tuning = None y_threshold_tuning = None if optimize_thresholds and pipeline.can_tune_threshold_with_objective(objective): @@ -134,7 +133,6 @@ def train_and_score_pipeline(pipeline, automl_config, full_X_train, full_y_train """ print(f"engine_base - train_and_score_pipeline - pipeline: {pipeline}") print(f"engine_base - train_and_score_pipeline - pipeline parameters: {pipeline.parameters}") - print(f"engine_base - train_and_score_pipeline - pipeline parameters: {pipeline.hyperparameters}") start = time.time() cv_data = [] logger.info("\tStarting cross validation") diff --git a/evalml/pipelines/classification_pipeline.py b/evalml/pipelines/classification_pipeline.py index 8dfa4d2560..c3215fd9ea 100644 --- a/evalml/pipelines/classification_pipeline.py +++ b/evalml/pipelines/classification_pipeline.py @@ -9,12 +9,11 @@ class ClassificationPipeline(PipelineBase): """Pipeline subclass for all classification pipelines.""" - def __init__(self, component_graph, parameters=None, custom_name=None, custom_hyperparameters=None, random_seed=0): + def __init__(self, component_graph, parameters=None, custom_name=None, random_seed=0): self._encoder = LabelEncoder() super().__init__(component_graph, custom_name=custom_name, parameters=parameters, - custom_hyperparameters=custom_hyperparameters, random_seed=random_seed) def fit(self, X, y): diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index 4995cc1cbc..4d44e0c8f4 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -506,7 +506,7 @@ def repr_component(parameters): custom_name_repr = f"custom_name='{self.custom_name}'" if self.custom_name else None random_seed_str = f"random_seed={self.random_seed}" - additional_args_str = ", ".join([arg for arg in [parameters_str, custom_hyperparmeter_str, custom_name_repr, random_seed_str] if arg is not None]) + additional_args_str = ", ".join([arg for arg in [parameters_str, custom_name_repr, random_seed_str] if arg is not None]) return f'pipeline = {(type(self).__name__)}(component_graph={component_graph_str}, {additional_args_str})' diff --git a/evalml/pipelines/time_series_classification_pipelines.py b/evalml/pipelines/time_series_classification_pipelines.py index 2124d8bddd..2a0cdde3c3 100644 --- a/evalml/pipelines/time_series_classification_pipelines.py +++ b/evalml/pipelines/time_series_classification_pipelines.py @@ -20,7 +20,7 @@ class TimeSeriesClassificationPipeline(ClassificationPipeline, metaclass=TimeSeriesPipelineBaseMeta): """Pipeline base class for time series classification problems.""" - def __init__(self, component_graph, parameters=None, custom_name=None, custom_hyperparameters=None, random_seed=0): + def __init__(self, component_graph, parameters=None, custom_name=None, random_seed=0): """Machine learning pipeline for time series classification problems made out of transformers and a classifier. 
Arguments: @@ -45,7 +45,6 @@ def __init__(self, component_graph, parameters=None, custom_name=None, custom_hy super().__init__(component_graph, custom_name=custom_name, parameters=parameters, - custom_hyperparameters=custom_hyperparameters, random_seed=random_seed) @staticmethod diff --git a/evalml/pipelines/time_series_regression_pipeline.py b/evalml/pipelines/time_series_regression_pipeline.py index 188feacdb5..30bb801d41 100644 --- a/evalml/pipelines/time_series_regression_pipeline.py +++ b/evalml/pipelines/time_series_regression_pipeline.py @@ -16,7 +16,7 @@ class TimeSeriesRegressionPipeline(RegressionPipeline, metaclass=TimeSeriesPipel problem_type = ProblemTypes.TIME_SERIES_REGRESSION - def __init__(self, component_graph, parameters=None, custom_name=None, custom_hyperparameters=None, random_seed=0): + def __init__(self, component_graph, parameters=None, custom_name=None, random_seed=0): """Machine learning pipeline for time series regression problems made out of transformers and a classifier. Arguments: @@ -41,7 +41,6 @@ def __init__(self, component_graph, parameters=None, custom_name=None, custom_hy super().__init__(component_graph, custom_name=custom_name, parameters=parameters, - custom_hyperparameters=custom_hyperparameters, random_seed=random_seed) def fit(self, X, y): diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index 11977f0f32..c3d963f915 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -128,7 +128,7 @@ def _get_pipeline_base_class(problem_type): return TimeSeriesMulticlassClassificationPipeline -def make_pipeline(X, y, estimator, problem_type, parameters=None, custom_hyperparameters=None, sampler_name=None): +def make_pipeline(X, y, estimator, problem_type, parameters=None, sampler_name=None): """Given input data, target data, an estimator class and the problem type, generates a pipeline class with a preprocessing chain which was recommended based on the inputs. The pipeline will be a subclass of the appropriate pipeline base class for the specified problem_type. @@ -140,8 +140,6 @@ def make_pipeline(X, y, estimator, problem_type, parameters=None, custom_hyperpa problem_type (ProblemTypes or str): Problem type for pipeline to generate parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values. An empty dictionary or None implies using all default values for component parameters. - custom_hyperparameters (dictionary): Dictionary of custom hyperparameters, - with component name as key and dictionary of parameters as the value sampler_name (str): The name of the sampler component to add to the pipeline. Only used in classification problems. Defaults to None @@ -160,11 +158,8 @@ def make_pipeline(X, y, estimator, problem_type, parameters=None, custom_hyperpa preprocessing_components = _get_preprocessing_components(X, y, problem_type, estimator, sampler_name) complete_component_graph = preprocessing_components + [estimator] - if custom_hyperparameters and not isinstance(custom_hyperparameters, dict): - raise ValueError(f"if custom_hyperparameters provided, must be dictionary. 
Received {type(custom_hyperparameters)}") - base_class = _get_pipeline_base_class(problem_type) - return base_class(complete_component_graph, parameters=parameters, custom_hyperparameters=custom_hyperparameters) + return base_class(complete_component_graph, parameters=parameters) def generate_pipeline_code(element): From e74914188a5ba2a3bdffb3bef88ce8b55964f9c3 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 24 May 2021 17:10:35 -0400 Subject: [PATCH 09/85] print update --- evalml/automl/automl_algorithm/iterative_algorithm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 35bd9ed179..5e39737897 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -191,7 +191,7 @@ def _transform_parameters(self, pipeline, proposed_parameters): if name in self._custom_hyperparameters and self._batch_number == 0: print(f"iterativealgorithm - _transform_parameters - hyperparameter name batch 0: {name}") for param_name, value in self._custom_hyperparameters[name].items(): - print(f"iterativealgorithm - _transform_parameters - param_name name/value: {param_name} - {value}") + print(f"iterativealgorithm - _transform_parameters - hyperparam_name name/value: {param_name} - {value}") if isinstance(value, (Integer, Real)): # get a random value in the space component_parameters[param_name] = value.rvs(random_state=self.random_seed)[0] @@ -201,6 +201,7 @@ def _transform_parameters(self, pipeline, proposed_parameters): component_parameters[param_name] = value if name in self._pipeline_params and self._batch_number == 0: for param_name, value in self._pipeline_params[name].items(): + print(f"iterativealgorithm - _transform_parameters - param_name name/value: {param_name} - {value}") if isinstance(value, (Integer, Real, Categorical)): raise ValueError("Pipeline parameters should not contain skopt.Space variables, please pass them " "to custom_hyperparameters instead!") From 358532c9e857138ca328bdcd6535697dbfcfa581 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 25 May 2021 10:36:41 -0400 Subject: [PATCH 10/85] Set component_parameters to include passed pipeline parameters --- evalml/automl/automl_algorithm/iterative_algorithm.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 5e39737897..4adf02e00e 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -199,6 +199,14 @@ def _transform_parameters(self, pipeline, proposed_parameters): component_parameters[param_name] = value.rvs(random_state=self.random_seed) else: component_parameters[param_name] = value + if name in pipeline.parameters and self._batch_number == 0: + for param_name, value in pipeline.parameters[name].items(): + print(f"iterativealgorithm - _transform_parameters - pipeline.parameters name/value: {param_name} - {value}") + if isinstance(value, (Integer, Real, Categorical)): + raise ValueError("Pipeline parameters should not contain skopt.Space variables, please pass them " + "to custom_hyperparameters instead!") + else: + component_parameters[param_name] = value if name in self._pipeline_params and self._batch_number == 0: for param_name, value in self._pipeline_params[name].items(): print(f"iterativealgorithm - 
_transform_parameters - param_name name/value: {param_name} - {value}") From d87aeb3cff6d7744bd58039b020d62c86cbcbc6a Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 25 May 2021 11:30:24 -0400 Subject: [PATCH 11/85] prints --- evalml/automl/automl_algorithm/automl_algorithm.py | 1 - evalml/automl/automl_algorithm/iterative_algorithm.py | 2 ++ evalml/tuners/tuner.py | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/evalml/automl/automl_algorithm/automl_algorithm.py b/evalml/automl/automl_algorithm/automl_algorithm.py index 2d94561eeb..3fe61b88c3 100644 --- a/evalml/automl/automl_algorithm/automl_algorithm.py +++ b/evalml/automl/automl_algorithm/automl_algorithm.py @@ -64,7 +64,6 @@ def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): """ if pipeline.name not in self._tuners: raise PipelineNotFoundError(f"No such pipeline allowed in this AutoML search: {pipeline.name}") - print(f"automlalgorithm - add_result - pipeline name: {pipeline.name}") print(f"automlalgorithm - add_result - pipeline: {pipeline}") print(f"automlalgorithm - add_result - pipeline parameters: {pipeline.parameters}") self._tuners[pipeline.name].add(pipeline.parameters, score_to_minimize) diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 4adf02e00e..3b5eccdbf9 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -150,6 +150,8 @@ def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): if pipeline.model_family != ModelFamily.ENSEMBLE: if self.batch_number == 1: try: + print(f'iterative algorithm - add_result - pipeline: {pipeline}') + print(f'iterative algorithm - add_result - trained_pipeline_results: {trained_pipeline_results}') super().add_result(score_to_minimize, pipeline, trained_pipeline_results) except ValueError as e: if 'is not within the bounds of the space' in str(e): diff --git a/evalml/tuners/tuner.py b/evalml/tuners/tuner.py index 08e35abde9..4e57a9beb0 100644 --- a/evalml/tuners/tuner.py +++ b/evalml/tuners/tuner.py @@ -41,6 +41,8 @@ def _convert_to_flat_parameters(self, pipeline_parameters): """Convert from pipeline parameters to a flat list of values""" flat_parameter_values = [] for flat_parameter_name in self._search_space_names: + print(f"tuner - _convert_to_flat_parameters - flat_parameter_name: {flat_parameter_name}") + print(f"tuner - _convert_to_flat_parameters - self._parameter_names_map[flat_parameter_name]: {self._parameter_names_map[flat_parameter_name]}") component_name, parameter_name = self._parameter_names_map[flat_parameter_name] if component_name not in pipeline_parameters or parameter_name not in pipeline_parameters[component_name]: raise TypeError('Pipeline parameters missing required field "{}" for component "{}"'.format(parameter_name, component_name)) From 38559e827e16fe9e0381f83cc23f82c6601d4aaf Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 25 May 2021 11:34:03 -0400 Subject: [PATCH 12/85] print --- evalml/tuners/tuner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/evalml/tuners/tuner.py b/evalml/tuners/tuner.py index 4e57a9beb0..5c289f57af 100644 --- a/evalml/tuners/tuner.py +++ b/evalml/tuners/tuner.py @@ -40,6 +40,7 @@ def __init__(self, pipeline_hyperparameter_ranges, random_seed=0): def _convert_to_flat_parameters(self, pipeline_parameters): """Convert from pipeline parameters to a flat list of values""" flat_parameter_values = [] + 
print(f"tuner - _convert_to_flat_parameters - self._search_space_names: {self._search_space_names}") for flat_parameter_name in self._search_space_names: print(f"tuner - _convert_to_flat_parameters - flat_parameter_name: {flat_parameter_name}") print(f"tuner - _convert_to_flat_parameters - self._parameter_names_map[flat_parameter_name]: {self._parameter_names_map[flat_parameter_name]}") From 416af9617fd298b254bdcbead0480596144ff367 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 25 May 2021 14:24:43 -0400 Subject: [PATCH 13/85] update get_hyperparameter_ranges --- .../automl_algorithm/automl_algorithm.py | 4 +++- evalml/automl/utils.py | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/evalml/automl/automl_algorithm/automl_algorithm.py b/evalml/automl/automl_algorithm/automl_algorithm.py index 3fe61b88c3..60e2621e0c 100644 --- a/evalml/automl/automl_algorithm/automl_algorithm.py +++ b/evalml/automl/automl_algorithm/automl_algorithm.py @@ -2,6 +2,7 @@ from evalml.exceptions import PipelineNotFoundError from evalml.tuners import SKOptTuner +from evalml.automl.utils import get_hyperparameter_ranges class AutoMLAlgorithmException(Exception): @@ -42,7 +43,8 @@ def __init__(self, print(f"AutoMLAlgorithm - init - hyperparameter key in pipeline keys: {comp_name}") pipeline_hyperparameters[comp_name] = custom_hyperparameters[comp_name] print(f"AutoMLAlgorithm - init - pipeline_hyperparameters: {pipeline_hyperparameters}") - self._tuners[pipeline.name] = self._tuner_class(pipeline_hyperparameters, random_seed=self.random_seed) + print(f"AutoMLAlgorithm - init - component_graph: {pipeline.component_graph}") + self._tuners[pipeline.name] = self._tuner_class(get_hyperparameter_ranges(pipeline.component_graph, custom_hyperparameters), random_seed=self.random_seed) self._pipeline_number = 0 self._batch_number = 0 diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index b0d69697ce..ab6318842f 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -1,9 +1,11 @@ +import copy from collections import namedtuple import pandas as pd from sklearn.model_selection import KFold, StratifiedKFold from evalml.objectives import get_objective +from evalml.pipelines import ComponentGraph from evalml.preprocessing.data_splitters import ( TimeSeriesSplit, TrainingValidationSplit @@ -151,3 +153,19 @@ def get_best_sampler_for_data(X, y, sampler_method, sampler_balanced_ratio): return 'SMOTENC Oversampler' except ImportError: return 'Undersampler' + + +def get_hyperparameter_ranges(linearized_component_graph, custom_hyperparameters): + """Returns hyperparameter ranges from all components as a dictionary""" + linearized_component_graph = ComponentGraph.linearized_component_graph(linearized_component_graph) + hyperparameter_ranges = dict() + print(f"utils - get_hyperparameter_ranges - linearized_component_graph: {linearized_component_graph}") + print(f"utils - get_hyperparameter_ranges - custom_hyperparameters: {custom_hyperparameters}") + for component_name, component_class in linearized_component_graph: + component_hyperparameters = copy.copy(component_class.hyperparameter_ranges) + print(f"utils - get_hyperparameter_ranges - component_hyperparameters: {component_name} - {component_hyperparameters}") + if custom_hyperparameters and component_name in custom_hyperparameters: + component_hyperparameters.update(custom_hyperparameters.get(component_name, {})) + hyperparameter_ranges[component_name] = component_hyperparameters + print(f"utils - 
get_hyperparameter_ranges - hyperparameter_ranges: {hyperparameter_ranges}") + return hyperparameter_ranges From c4c3d98f28988c4282321f2d05c99b73c7976115 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 25 May 2021 15:10:29 -0400 Subject: [PATCH 14/85] print --- evalml/automl/automl_algorithm/automl_algorithm.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/evalml/automl/automl_algorithm/automl_algorithm.py b/evalml/automl/automl_algorithm/automl_algorithm.py index 60e2621e0c..c17fdfe584 100644 --- a/evalml/automl/automl_algorithm/automl_algorithm.py +++ b/evalml/automl/automl_algorithm/automl_algorithm.py @@ -36,15 +36,15 @@ def __init__(self, self._tuners = {} for pipeline in self.allowed_pipelines: print(f"AutoMLAlgorithm - init - pipeline: {pipeline}") - pipeline_hyperparameters = dict() + print(f"AutoMLAlgorithm - init - pipeline.parameters: {pipeline.parameters}") + pipeline_hyperparameters = get_hyperparameter_ranges(pipeline.component_graph, custom_hyperparameters) + print(f"AutoMLAlgorithm - init - pipeline.pipeline_hyperparameters: {pipeline_hyperparameters}") for comp_name in custom_hyperparameters.keys(): - print(f"AutoMLAlgorithm - init - comp_name: {comp_name}") if comp_name in pipeline.parameters.keys(): - print(f"AutoMLAlgorithm - init - hyperparameter key in pipeline keys: {comp_name}") - pipeline_hyperparameters[comp_name] = custom_hyperparameters[comp_name] + print(f"AutoMLAlgorithm - init - hyperparameter in pipeline: {comp_name} - {custom_hyperparameters[comp_name]}") + pipeline_hyperparameters[comp_name].update(custom_hyperparameters[comp_name]) print(f"AutoMLAlgorithm - init - pipeline_hyperparameters: {pipeline_hyperparameters}") - print(f"AutoMLAlgorithm - init - component_graph: {pipeline.component_graph}") - self._tuners[pipeline.name] = self._tuner_class(get_hyperparameter_ranges(pipeline.component_graph, custom_hyperparameters), random_seed=self.random_seed) + self._tuners[pipeline.name] = self._tuner_class(pipeline_hyperparameters, random_seed=self.random_seed) self._pipeline_number = 0 self._batch_number = 0 From 8f8dad8f232a1fa21534f0057dd231103259d542 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Wed, 26 May 2021 12:25:35 -0400 Subject: [PATCH 15/85] prevent batch 0 parameter selection from being out of bounds of the hyperparameter ranges --- evalml/automl/automl_algorithm/iterative_algorithm.py | 6 +++++- evalml/automl/callbacks.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 3b5eccdbf9..ee050bb82a 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -191,7 +191,7 @@ def _transform_parameters(self, pipeline, proposed_parameters): # For first batch, pass the pipeline params to the components that need them print(f"iterativealgorithm - _transform_parameters - self._pipeline_params: {self._pipeline_params}") if name in self._custom_hyperparameters and self._batch_number == 0: - print(f"iterativealgorithm - _transform_parameters - hyperparameter name batch 0: {name}") + print(f"iterativealgorithm - _transform_parameters - hyperparameter name: {name}") for param_name, value in self._custom_hyperparameters[name].items(): print(f"iterativealgorithm - _transform_parameters - hyperparam_name name/value: {param_name} - {value}") if isinstance(value, (Integer, Real)): @@ -203,6 +203,10 @@ def 
_transform_parameters(self, pipeline, proposed_parameters): component_parameters[param_name] = value if name in pipeline.parameters and self._batch_number == 0: for param_name, value in pipeline.parameters[name].items(): + if name in self._custom_hyperparameters.keys(): + if param_name in self._custom_hyperparameters[name]: + print(f"iterativealgorithm - _transform_parameters - skipping param_name: {param_name}") + continue print(f"iterativealgorithm - _transform_parameters - pipeline.parameters name/value: {param_name} - {value}") if isinstance(value, (Integer, Real, Categorical)): raise ValueError("Pipeline parameters should not contain skopt.Space variables, please pass them " diff --git a/evalml/automl/callbacks.py b/evalml/automl/callbacks.py index 47186a0caa..d8d9a5e275 100644 --- a/evalml/automl/callbacks.py +++ b/evalml/automl/callbacks.py @@ -29,5 +29,5 @@ def log_error_callback(exception, traceback, automl, **kwargs): logger.info(f"\t\t\tFold {fold_num}: All scores will be replaced with nan.") logger.info(f"\t\t\tFold {fold_num}: Please check {logger.handlers[1].baseFilename} for the current hyperparameters and stack trace.") logger.info(f"\t\t\tFold {fold_num}: Exception during automl search: {str(exception)}") - logger.debug(f"\t\t\tFold {fold_num}: Hyperparameters:\n\t{pipeline.hyperparameters}") + logger.debug(f"\t\t\tFold {fold_num}: Hyperparameters:\n\t{pipeline.parameters}") logger.debug(f"\t\t\tFold {fold_num}: Traceback:\n{trace}") From 4b93ea6889f25bb247c477b5bac65cd6d781ab3d Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Wed, 26 May 2021 13:17:33 -0400 Subject: [PATCH 16/85] api updates --- evalml/automl/automl_algorithm/iterative_algorithm.py | 8 ++++++-- evalml/tests/automl_tests/test_automl.py | 5 +++-- evalml/tests/conftest.py | 4 ++-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index ee050bb82a..a3593df442 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -186,8 +186,7 @@ def _transform_parameters(self, pipeline, proposed_parameters): # Inspects each component and adds the following parameters when needed if 'n_jobs' in init_params: component_parameters['n_jobs'] = self.n_jobs - if 'number_features' in init_params: - component_parameters['number_features'] = self.number_features + print(f"iterativealgorithm - _transform_parameters - component_parameters 0: {component_parameters}") # For first batch, pass the pipeline params to the components that need them print(f"iterativealgorithm - _transform_parameters - self._pipeline_params: {self._pipeline_params}") if name in self._custom_hyperparameters and self._batch_number == 0: @@ -212,6 +211,7 @@ def _transform_parameters(self, pipeline, proposed_parameters): raise ValueError("Pipeline parameters should not contain skopt.Space variables, please pass them " "to custom_hyperparameters instead!") else: + component_parameters[param_name] = value if name in self._pipeline_params and self._batch_number == 0: for param_name, value in self._pipeline_params[name].items(): @@ -221,6 +221,10 @@ def _transform_parameters(self, pipeline, proposed_parameters): "to custom_hyperparameters instead!") else: component_parameters[param_name] = value + print(f"iterativealgorithm - _transform_parameters - component_parameters 1: {component_parameters}") + if 'number_features' in init_params: + print(f"iterativealgorithm - 
_transform_parameters - component_parameters 2: {component_parameters}") + component_parameters['number_features'] = self.number_features if name in self._pipeline_params and name == 'Drop Columns Transformer' and self._batch_number > 0: component_parameters['columns'] = self._pipeline_params[name]['columns'] if 'pipeline' in self._pipeline_params: diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 2851511f9a..a7e2baecf7 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -1765,8 +1765,9 @@ def test_iterative_algorithm_pipeline_hyperparameters_make_pipeline_other_errors } estimators = get_estimators('multiclass', [ModelFamily.EXTRA_TREES]) - pipelines = [make_pipeline(X, y, estimator, 'multiclass', None, custom_hyperparameters) for estimator in estimators] - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=pipelines, n_jobs=1) + pipelines = [make_pipeline(X, y, estimator, 'multiclass', None) for estimator in estimators] + automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=pipelines, + custom_hyperparameters=custom_hyperparameters, n_jobs=1) mock_add.side_effect = ValueError("Alternate error that can be thrown") with pytest.raises(ValueError) as error: diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index deacece682..552f68531f 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -391,7 +391,7 @@ class LogisticRegressionMulticlassPipeline(MulticlassClassificationPipeline): component_graph = ['Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier'] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) def clone(self): return self.__class__(self.parameters, random_seed=self.random_seed) @@ -405,7 +405,7 @@ class LogisticRegressionBinaryPipeline(BinaryClassificationPipeline): custom_name = "Logistic Regression Binary Pipeline" def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) def new(self, parameters, random_seed=0): return self.__class__(parameters, random_seed=random_seed) From 0e968e5ddcfd90091e799d2a5d73b24df5242978 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 27 May 2021 10:04:36 -0400 Subject: [PATCH 17/85] test updates --- .../automl_algorithm/iterative_algorithm.py | 40 +++--- evalml/automl/automl_search.py | 2 + evalml/tests/automl_tests/test_automl.py | 130 ++++++++++-------- evalml/tuners/tuner.py | 3 + 4 files changed, 100 insertions(+), 75 deletions(-) diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index a3593df442..0bb72f99eb 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -172,27 +172,28 @@ def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): def _transform_parameters(self, pipeline, proposed_parameters): """Given a pipeline parameters 
dict, make sure n_jobs and number_features are set.""" - print('-------------------------') + print(f"****************************** Batch: {self._batch_number} ******************************") + print(f"iterativealgorithm - _transform_parameters - pipeline: {pipeline}") + print(f"iterativealgorithm - _transform_parameters - pipeline parameters: {pipeline.parameters}") + print(f"iterativealgorithm - _transform_parameters - proposed_parameters: {proposed_parameters}") + print(f"iterativealgorithm - _transform_parameters - self._pipeline_params: {self._pipeline_params}") parameters = {} if 'pipeline' in self._pipeline_params: parameters['pipeline'] = self._pipeline_params['pipeline'] + print(f"iterativealgorithm - _transform_parameters - parameters['pipeline']: {parameters['pipeline']}") + for name, component_class in pipeline.linearized_component_graph: - print(f"iterativealgorithm - _transform_parameters - pipeline: {pipeline}") - print(f"iterativealgorithm - _transform_parameters - pipeline.linearized_component_graph: {pipeline.linearized_component_graph}") - print(f"iterativealgorithm - _transform_parameters - pipeline parameters: {pipeline.parameters}") + print('-------------------------------------------------------------------------') + print(f"iterativealgorithm - _transform_parameters - component name: {name}") component_parameters = proposed_parameters.get(name, {}) init_params = inspect.signature(component_class.__init__).parameters print(f"iterativealgorithm - _transform_parameters - init_params: {init_params}") - # Inspects each component and adds the following parameters when needed - if 'n_jobs' in init_params: - component_parameters['n_jobs'] = self.n_jobs - print(f"iterativealgorithm - _transform_parameters - component_parameters 0: {component_parameters}") # For first batch, pass the pipeline params to the components that need them - print(f"iterativealgorithm - _transform_parameters - self._pipeline_params: {self._pipeline_params}") + print(f"iterativealgorithm - _transform_parameters - component_parameters - 0: {component_parameters}") if name in self._custom_hyperparameters and self._batch_number == 0: print(f"iterativealgorithm - _transform_parameters - hyperparameter name: {name}") for param_name, value in self._custom_hyperparameters[name].items(): - print(f"iterativealgorithm - _transform_parameters - hyperparam_name name/value: {param_name} - {value}") + print(f"iterativealgorithm - _transform_parameters - hyperparameter_name name/value: {param_name} - {value}") if isinstance(value, (Integer, Real)): # get a random value in the space component_parameters[param_name] = value.rvs(random_state=self.random_seed)[0] @@ -200,12 +201,9 @@ def _transform_parameters(self, pipeline, proposed_parameters): component_parameters[param_name] = value.rvs(random_state=self.random_seed) else: component_parameters[param_name] = value + print(f"iterativealgorithm - _transform_parameters - component_parameters - 1: {component_parameters}") if name in pipeline.parameters and self._batch_number == 0: for param_name, value in pipeline.parameters[name].items(): - if name in self._custom_hyperparameters.keys(): - if param_name in self._custom_hyperparameters[name]: - print(f"iterativealgorithm - _transform_parameters - skipping param_name: {param_name}") - continue print(f"iterativealgorithm - _transform_parameters - pipeline.parameters name/value: {param_name} - {value}") if isinstance(value, (Integer, Real, Categorical)): raise ValueError("Pipeline parameters should not contain skopt.Space 
variables, please pass them " @@ -213,26 +211,30 @@ def _transform_parameters(self, pipeline, proposed_parameters): else: component_parameters[param_name] = value + print(f"iterativealgorithm - _transform_parameters - component_parameters - 2: {component_parameters}") if name in self._pipeline_params and self._batch_number == 0: for param_name, value in self._pipeline_params[name].items(): - print(f"iterativealgorithm - _transform_parameters - param_name name/value: {param_name} - {value}") + print(f"iterativealgorithm - _transform_parameters - self._pipeline_params name/value: {param_name} - {value}") if isinstance(value, (Integer, Real, Categorical)): raise ValueError("Pipeline parameters should not contain skopt.Space variables, please pass them " "to custom_hyperparameters instead!") else: component_parameters[param_name] = value - print(f"iterativealgorithm - _transform_parameters - component_parameters 1: {component_parameters}") + print(f"iterativealgorithm - _transform_parameters - component_parameters - 3: {component_parameters}") + # Inspects each component and adds the following parameters when needed + if 'n_jobs' in init_params: + component_parameters['n_jobs'] = self.n_jobs if 'number_features' in init_params: - print(f"iterativealgorithm - _transform_parameters - component_parameters 2: {component_parameters}") component_parameters['number_features'] = self.number_features if name in self._pipeline_params and name == 'Drop Columns Transformer' and self._batch_number > 0: component_parameters['columns'] = self._pipeline_params[name]['columns'] if 'pipeline' in self._pipeline_params: + print(f"iterativealgorithm - _transform_parameters - self._pipeline_params end: {self._pipeline_params}") for param_name, value in self._pipeline_params['pipeline'].items(): + print(f"iterativealgorithm - _transform_parameters - self._pipeline_params['pipeline'] name/value: {param_name} - {value}") if param_name in init_params: component_parameters[param_name] = value - print(f"iterativealgorithm - _transform_parameters - component_parameters: {component_parameters}") + print(f"iterativealgorithm - _transform_parameters - component_parameters - 4: {component_parameters}") parameters[name] = component_parameters print(f"iterativealgorithm - _transform_parameters - parameters: {parameters}") - print('-------------------------') return parameters diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 7ab878e5a0..c874600290 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -363,6 +363,7 @@ def __init__(self, self._frozen_pipeline_parameters = {} parameters = copy.copy(self.pipeline_parameters) + print(f"automl_search - init - parameters: {parameters}") custom_hyperparameters = copy.copy(self.custom_hyperparameters) if self.problem_configuration: parameters.update({'pipeline': self.problem_configuration}) @@ -447,6 +448,7 @@ def __init__(self, for pipe_ in self.allowed_pipelines: print(f"automlsearch - init - pre iterative algorithm pipeline parameters: {pipe_.parameters}") + print(f"automlsearch - init - parameters: {parameters}") self._automl_algorithm = IterativeAlgorithm( max_iterations=self.max_iterations, diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index a7e2baecf7..1cac0aab97 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -1796,20 +1796,41 @@ def test_iterative_algorithm_pipeline_hyperparameters_make_pipeline_errors(mock_ } 
    estimators = get_estimators('multiclass', [ModelFamily.EXTRA_TREES])

-    invalid_pipelines = [make_pipeline(X, y, estimator, 'multiclass', None, invalid_custom_hyperparameters) for estimator in estimators]
-    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=invalid_pipelines)
+    invalid_pipelines = [make_pipeline(X, y, estimator, 'multiclass', None) for estimator in estimators]
+    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=invalid_pipelines, custom_hyperparameters=invalid_custom_hyperparameters)
     with pytest.raises(ValueError, match="Default parameters for components"):
         automl.search()

-    invalid_pipelines = [make_pipeline(X, y, estimator, 'multiclass', None, larger_invalid) for estimator in estimators]
-    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=invalid_pipelines)
+    invalid_pipelines = [make_pipeline(X, y, estimator, 'multiclass', None) for estimator in estimators]
+    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=invalid_pipelines, custom_hyperparameters=larger_invalid)
     with pytest.raises(ValueError, match="Default parameters for components"):
         automl.search()

-
+'''
+TURN THIS INTO A MASSIVE GRID TEST
+'''
 @patch('evalml.pipelines.BinaryClassificationPipeline.score')
 @patch('evalml.pipelines.BinaryClassificationPipeline.fit')
-def test_iterative_algorithm_pipeline_hyperparameters_make_pipeline(mock_fit, mock_score, X_y_multi):
+def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline(mock_fit, mock_score, X_y_multi):
     X, y = X_y_multi
     custom_hyperparameters = {
         "Imputer": {
@@ -1826,17 +1847,17 @@ def test_iterative_algorithm_pipeline_hyperparameters_make_pipeline(mock_fit, mo
         }
     }
     estimators = get_estimators('multiclass', [ModelFamily.EXTRA_TREES])
-    pipelines = [make_pipeline(X, y, estimator, 'multiclass', None, custom_hyperparameters) for estimator in estimators]
+    pipelines = [make_pipeline(X, y, estimator, 'multiclass', None) for estimator in estimators]

-    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=pipelines)
+    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=pipelines, custom_hyperparameters=custom_hyperparameters)
     automl.search()
-    assert automl.best_pipeline.hyperparameters['Imputer']['numeric_impute_strategy'] == ["mean"]
+    assert automl.best_pipeline.parameters['Imputer']['numeric_impute_strategy'] == "mean"

-    invalid_pipelines = [make_pipeline(X, y, estimator, 'multiclass', None, larger_custom) for estimator in estimators]
-    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=invalid_pipelines)
+    invalid_pipelines = [make_pipeline(X, y, estimator, 'multiclass', None) for estimator in estimators]
+    automl = AutoMLSearch(X_train=X, 
y_train=y, problem_type='multiclass', max_batches=2, allowed_pipelines=invalid_pipelines, custom_hyperparameters=larger_custom) automl.search() - - assert automl.best_pipeline.hyperparameters['Imputer']['numeric_impute_strategy'] == ["most_frequent", "mean"] + for params in automl.full_rankings['parameters'].values[:-1]: + assert params['Imputer']['numeric_impute_strategy'] in larger_custom['Imputer']['numeric_impute_strategy'] @patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.6}) @@ -2219,7 +2240,7 @@ def test_automl_pipeline_params_simple(mock_fit, mock_score, X_y_binary): assert row['parameters']['Elastic Net Classifier']['l1_ratio'] == 0.2 -@patch('evalml.pipelines.RegressionPipeline.fit') +'''@patch('evalml.pipelines.RegressionPipeline.fit') @patch('evalml.pipelines.RegressionPipeline.score') def test_automl_pipeline_params_multiple(mock_score, mock_fit, X_y_regression): mock_score.return_value = {'R2': 1.0} @@ -2237,7 +2258,7 @@ def test_automl_pipeline_params_multiple(mock_score, mock_fit, X_y_regression): assert row['parameters']['Decision Tree Regressor']['max_features'] == 'auto' if 'Elastic Net Regressor' in row['parameters']: assert 0 < row['parameters']['Elastic Net Regressor']['alpha'] < 0.5 - assert row['parameters']['Elastic Net Regressor']['l1_ratio'] == Categorical((0.01, 0.02, 0.03)).rvs(random_state=automl.random_seed) + assert row['parameters']['Elastic Net Regressor']['l1_ratio'] == Categorical((0.01, 0.02, 0.03)).rvs(random_state=automl.random_seed)''' @patch('evalml.pipelines.BinaryClassificationPipeline.fit') @@ -2253,15 +2274,13 @@ def test_automl_respects_pipeline_parameters_with_duplicate_components(mock_scor component_graph_linear = ["Imputer", "Imputer", "Random Forest Classifier"] pipeline_linear = BinaryClassificationPipeline(component_graph_linear) automl = AutoMLSearch(X, y, problem_type="binary", allowed_pipelines=[pipeline_dict, pipeline_linear], - pipeline_parameters={"Imputer": {"numeric_impute_strategy": Categorical(["most_frequent"])}, - "Imputer_1": {"numeric_impute_strategy": Categorical(["median"])}}, + pipeline_parameters={"Imputer": {"numeric_impute_strategy": "most_frequent"}, + "Imputer_1": {"numeric_impute_strategy": "median"}}, max_batches=3) automl.search() - for i, row in automl.full_rankings.iterrows(): - if "Mode Baseline Binary" in row['pipeline_name']: - continue - assert row["parameters"]["Imputer"]["numeric_impute_strategy"] == "most_frequent" - assert row["parameters"]["Imputer_1"]["numeric_impute_strategy"] == "median" + for row in automl.full_rankings.iloc[1:3].parameters: + assert row["Imputer"]["numeric_impute_strategy"] == "most_frequent" + assert row["Imputer_1"]["numeric_impute_strategy"] == "median" component_graph_dict = {"One Hot Encoder": ["One Hot Encoder"], "One Hot Encoder_1": ["One Hot Encoder", "One Hot Encoder"], @@ -2272,38 +2291,38 @@ def test_automl_respects_pipeline_parameters_with_duplicate_components(mock_scor pipeline_linear = BinaryClassificationPipeline(component_graph_linear) automl = AutoMLSearch(X, y, problem_type="binary", allowed_pipelines=[pipeline_linear, pipeline_dict], - pipeline_parameters={"One Hot Encoder": {"top_n": Categorical([15])}, - "One Hot Encoder_1": {"top_n": Categorical([25])}}, + pipeline_parameters={"One Hot Encoder": {"top_n": 15}, + "One Hot Encoder_1": {"top_n": 25}}, max_batches=3) automl.search() - for i, row in automl.full_rankings.iterrows(): - if "Mode Baseline Binary" in row['pipeline_name']: - continue - assert 
row["parameters"]["One Hot Encoder"]["top_n"] == 15 - assert row["parameters"]["One Hot Encoder_1"]["top_n"] == 25 + for row in automl.full_rankings.iloc[1:3].parameters: + assert row["One Hot Encoder"]["top_n"] == 15 + assert row["One Hot Encoder_1"]["top_n"] == 25 +@pytest.mark.parametrize('graph_type', ['linear', 'dict']) @patch('evalml.pipelines.BinaryClassificationPipeline.fit') @patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.02}) -def test_automl_respects_pipeline_custom_hyperparameters_with_duplicate_components(mock_score, mock_fit, X_y_binary): +def test_automl_respects_pipeline_custom_hyperparameters_with_duplicate_components(mock_score, mock_fit, graph_type, X_y_binary): X, y = X_y_binary - custom_hyperparameters = {"Imputer": {"numeric_impute_strategy": Categorical(["most_frequent", 'mean'])}, - "Imputer_1": {"numeric_impute_strategy": Categorical(["median", 'mean'])}, - "Random Forest Classifier": {"n_estimators": Categorical([50, 100])}} - component_graph_dict = {"Imputer": ["Imputer"], - "Imputer_1": ["Imputer", "Imputer"], - "Random Forest Classifier": ["Random Forest Classifier", "Imputer_1"]} - pipeline_dict = BinaryClassificationPipeline(component_graph_dict, custom_name="Pipeline from dict", custom_hyperparameters=custom_hyperparameters) - - custom_hyperparameters = {"Imputer": {"numeric_impute_strategy": Categorical(["mean"])}, - "Imputer_1": {"numeric_impute_strategy": Categorical(["most_frequent", 'mean'])}, - "Random Forest Classifier": {"n_estimators": Categorical([100, 125])}} - component_graph_linear = ["Imputer", "Imputer", "Random Forest Classifier"] - pipeline_linear = BinaryClassificationPipeline(component_graph_linear) + if graph_type == 'linear': + custom_hyperparameters = {"Imputer": {"numeric_impute_strategy": Categorical(["mean"])}, + "Imputer_1": { + "numeric_impute_strategy": Categorical(["most_frequent", 'mean'])}, + "Random Forest Classifier": {"n_estimators": Categorical([100, 125])}} + component_graph = ["Imputer", "Imputer", "Random Forest Classifier"] + pipeline_ = BinaryClassificationPipeline(component_graph) + else: + custom_hyperparameters = {"Imputer": {"numeric_impute_strategy": Categorical(["most_frequent", 'mean'])}, + "Imputer_1": {"numeric_impute_strategy": Categorical(["median", 'mean'])}, + "Random Forest Classifier": {"n_estimators": Categorical([50, 100])}} + component_graph = {"Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + "Random Forest Classifier": ["Random Forest Classifier", "Imputer_1"]} + pipeline_ = BinaryClassificationPipeline(component_graph, custom_name="Pipeline from dict") - automl = AutoMLSearch(X, y, problem_type="binary", allowed_pipelines=[pipeline_dict, pipeline_linear], - max_batches=5) + automl = AutoMLSearch(X, y, problem_type="binary", allowed_pipelines=[pipeline_], custom_hyperparameters=custom_hyperparameters, max_batches=5) automl.search() for i, row in automl.full_rankings.iterrows(): if "Mode Baseline Binary" in row['pipeline_name']: @@ -2323,33 +2342,32 @@ def test_automl_respects_pipeline_custom_hyperparameters_with_duplicate_componen def test_automl_adds_pipeline_parameters_to_custom_pipeline_hyperparams(mock_score, mock_fit, X_y_binary): X, y = X_y_binary - # Pass the input of the first imputer to the second imputer - custom_hyperparameters = {"One Hot Encoder": {"top_n": Categorical([5, 10])}} - component_graph = {"Imputer": ["Imputer"], "Imputer_1": ["Imputer", "Imputer"], "One Hot Encoder": ["One Hot Encoder", "Imputer_1"], "Random 
Forest Classifier": ["Random Forest Classifier", "One Hot Encoder"]} - pipeline_one = BinaryClassificationPipeline(component_graph, custom_name="Pipe Line One", custom_hyperparameters=custom_hyperparameters) + pipeline_one = BinaryClassificationPipeline(component_graph, custom_name="Pipe Line One") pipeline_two = BinaryClassificationPipeline(["Imputer", "Imputer", "One Hot Encoder", "Random Forest Classifier"], - custom_name="Pipe Line Two", - custom_hyperparameters={"One Hot Encoder": {"top_n": Categorical([12, 10])}}) + custom_name="Pipe Line Two") pipeline_three = BinaryClassificationPipeline(["Imputer", "Imputer", "One Hot Encoder", "Random Forest Classifier"], - custom_name="Pipe Line Three", - custom_hyperparameters={"Imputer": {"numeric_imputer_strategy": Categorical(["median"])}}) + custom_name="Pipe Line Three") automl = AutoMLSearch(X, y, problem_type="binary", allowed_pipelines=[pipeline_one, pipeline_two, pipeline_three], - pipeline_parameters={"Imputer": {"numeric_impute_strategy": Categorical(["most_frequent"])}}, + pipeline_parameters={"Imputer": {"numeric_impute_strategy": "most_frequent"}}, + custom_hyperparameters={"One Hot Encoder": {"top_n": Categorical([12, 10])}, + "Imputer": {"numeric_impute_strategy": Categorical(["median"])}}, max_batches=4) automl.search() - + from pprint import pp + print(automl.full_rankings) + for pipe in automl.full_rankings.parameters: + pp(pipe) expected_top_n = {"Pipe Line One": {5, 10}, "Pipe Line Two": {12, 10}, "Pipe Line Three": {10}} for i, row in automl.full_rankings.iterrows(): if "Mode Baseline Binary" in row['pipeline_name']: continue assert row["parameters"]["Imputer"]["numeric_impute_strategy"] == "most_frequent" - assert row["parameters"]["One Hot Encoder"]["top_n"] in expected_top_n[row["pipeline_name"]] assert any(row['parameters']["One Hot Encoder"]["top_n"] == 12 for _, row in automl.full_rankings.iterrows() if row["pipeline_name"] == "Pipe Line Two") assert any(row['parameters']["One Hot Encoder"]["top_n"] == 5 for _, row in automl.full_rankings.iterrows() if row["pipeline_name"] == "Pipe Line One") diff --git a/evalml/tuners/tuner.py b/evalml/tuners/tuner.py index 5c289f57af..26a47a72c3 100644 --- a/evalml/tuners/tuner.py +++ b/evalml/tuners/tuner.py @@ -36,6 +36,9 @@ def __init__(self, pipeline_hyperparameter_ranges, random_seed=0): self._parameter_names_map[flat_parameter_name] = (component_name, parameter_name) self._search_space_names.append(flat_parameter_name) self._search_space_ranges.append(parameter_range) + print(f"Tuner - __init__ - self._parameter_names_map: {self._parameter_names_map}") + print(f"Tuner - __init__ - self._search_space_names: {self._search_space_names}") + print(f"Tuner - __init__ - self.self._search_space_ranges: {self._search_space_ranges}") def _convert_to_flat_parameters(self, pipeline_parameters): """Convert from pipeline parameters to a flat list of values""" From 27f048d3550b716d8e7728c04e7bf04c2a951fe4 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 27 May 2021 11:17:55 -0400 Subject: [PATCH 18/85] add changes to pipelines passed in --- evalml/automl/automl_search.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index c874600290..4bf0eaa0c9 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -389,6 +389,14 @@ def __init__(self, self._frozen_pipeline_parameters['Drop Columns Transformer'] = {'columns': index_columns} parameters[self._sampler_name] = {"sampling_ratio": 
self.sampler_balanced_ratio} self.allowed_pipelines = [make_pipeline(self.X_train, self.y_train, estimator, self.problem_type, parameters=self._frozen_pipeline_parameters, sampler_name=self._sampler_name) for estimator in allowed_estimators] + else: + for pipeline in self.allowed_pipelines: + if self.pipeline_parameters: + if self.custom_hyperparameters: + for component_name, params in self.custom_hyperparameters.items(): + pipeline.custom_hyperparameters[component_name] = params + else: + pipeline.custom_hyperparameters = self.pipeline_parameters if self.allowed_pipelines == []: raise ValueError("No allowed pipelines to search") From 7bcec95a6e2f280c4ee97f40c12e481665bff7cd Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 27 May 2021 13:51:44 -0400 Subject: [PATCH 19/85] practice tests --- evalml/automl/automl_search.py | 17 +++---- evalml/tests/automl_tests/test_automl.py | 64 +++++++++++++++++++++++- 2 files changed, 70 insertions(+), 11 deletions(-) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 4bf0eaa0c9..841df869ec 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -387,16 +387,15 @@ def __init__(self, index_columns = list(self.X_train.select('index').columns) if len(index_columns) > 0 and drop_columns is None: self._frozen_pipeline_parameters['Drop Columns Transformer'] = {'columns': index_columns} - parameters[self._sampler_name] = {"sampling_ratio": self.sampler_balanced_ratio} - self.allowed_pipelines = [make_pipeline(self.X_train, self.y_train, estimator, self.problem_type, parameters=self._frozen_pipeline_parameters, sampler_name=self._sampler_name) for estimator in allowed_estimators] + parameters['Drop Columns Transformer'] = {'columns': index_columns} + self.allowed_pipelines = [make_pipeline(self.X_train, self.y_train, estimator, self.problem_type, parameters=parameters, sampler_name=self._sampler_name) for estimator in allowed_estimators] else: - for pipeline in self.allowed_pipelines: - if self.pipeline_parameters: - if self.custom_hyperparameters: - for component_name, params in self.custom_hyperparameters.items(): - pipeline.custom_hyperparameters[component_name] = params - else: - pipeline.custom_hyperparameters = self.pipeline_parameters + if self.pipeline_parameters: + if self.custom_hyperparameters: + for component_name, params in self.custom_hyperparameters.items(): + self.custom_hyperparameters = params + else: + self.custom_hyperparameters = self.pipeline_parameters if self.allowed_pipelines == []: raise ValueError("No allowed pipelines to search") diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 1cac0aab97..1236e75dc6 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -2377,9 +2377,9 @@ def test_automl_adds_pipeline_parameters_to_custom_pipeline_hyperparams(mock_sco def test_automl_pipeline_params_kwargs(mock_fit, mock_score, X_y_multi): mock_score.return_value = {'Log Loss Multiclass': 1.0} X, y = X_y_multi - params = {'Imputer': {'numeric_impute_strategy': Categorical(['most_frequent'])}, + hyperparams = {'Imputer': {'numeric_impute_strategy': Categorical(['most_frequent'])}, 'Decision Tree Classifier': {'max_depth': Integer(1, 2), 'ccp_alpha': Real(0.1, 0.5)}} - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', pipeline_parameters=params, + automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', custom_hyperparameters=hyperparams, 
allowed_model_families=[ModelFamily.DECISION_TREE], n_jobs=1) automl.search() for i, row in automl.rankings.iterrows(): @@ -2390,6 +2390,66 @@ def test_automl_pipeline_params_kwargs(mock_fit, mock_score, X_y_multi): assert row['parameters']['Decision Tree Classifier']['max_depth'] == 1 +#@patch('evalml.pipelines.BinaryClassificationPipeline.score') +#@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +def test_pipelines_true_true_true(X_y_binary): + ''' + The numeric_impute_strategy: most_frequent parameter of Imputer won't be found in the hyperparameter ranges. + The tuner will look for most_frequent but only find mean. + Solution is to ignore conflicting hyperparameters + ''' + X, y = X_y_binary + + component_graph = ['Imputer', 'Random Forest Classifier'] + parameters = { + "Imputer": {'numeric_impute_strategy': 'most_frequent'}, + "Random Forest Classifier": {'n_estimators': 200, + "max_depth": 11} + } + custom_hyperparameters = { + "Random Forest Classifier": {"max_depth": Integer(11, 12)} + } + + pipeline_ = BinaryClassificationPipeline(component_graph=component_graph, parameters=parameters) + + automl = AutoMLSearch(X, y, problem_type="binary", + max_batches=3, allowed_pipelines=[pipeline_], + custom_hyperparameters=custom_hyperparameters) + automl.search() + + +def test_pipelines_true_true_false(): + pass + + +def test_pipelines_true_false_true(): + pass + + +def test_pipelines_true_false_false(): + pass + + +def test_pipelines_false_true_true(): + pass + + +def test_pipelines_false_true_false(): + pass + + +def test_pipelines_false_false_true(): + pass + + +def test_pipelines_false_false_false(): + pass + + + + + + @pytest.mark.parametrize("random_seed", [0, 1, 9]) @patch('evalml.pipelines.MulticlassClassificationPipeline.score') @patch('evalml.pipelines.MulticlassClassificationPipeline.fit') From a62361ccd8246add5e529ec6b8c6013470972d01 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 27 May 2021 15:12:39 -0400 Subject: [PATCH 20/85] test updates --- evalml/automl/automl_search.py | 7 ----- evalml/tests/automl_tests/test_automl.py | 35 ++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 841df869ec..90dd10b2c9 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -389,13 +389,6 @@ def __init__(self, self._frozen_pipeline_parameters['Drop Columns Transformer'] = {'columns': index_columns} parameters['Drop Columns Transformer'] = {'columns': index_columns} self.allowed_pipelines = [make_pipeline(self.X_train, self.y_train, estimator, self.problem_type, parameters=parameters, sampler_name=self._sampler_name) for estimator in allowed_estimators] - else: - if self.pipeline_parameters: - if self.custom_hyperparameters: - for component_name, params in self.custom_hyperparameters.items(): - self.custom_hyperparameters = params - else: - self.custom_hyperparameters = self.pipeline_parameters if self.allowed_pipelines == []: raise ValueError("No allowed pipelines to search") diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 1236e75dc6..2152138667 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -3,6 +3,7 @@ from collections import OrderedDict from itertools import product from unittest.mock import MagicMock, PropertyMock, patch +from pprint import pp import cloudpickle import numpy as np @@ -2409,17 +2410,45 @@ def 
test_pipelines_true_true_true(X_y_binary): custom_hyperparameters = { "Random Forest Classifier": {"max_depth": Integer(11, 12)} } + pipeine_parameters = { + "Random Forest Classifier": {'n_estimators': 222} + } pipeline_ = BinaryClassificationPipeline(component_graph=component_graph, parameters=parameters) automl = AutoMLSearch(X, y, problem_type="binary", - max_batches=3, allowed_pipelines=[pipeline_], + max_batches=3, allowed_pipelines=[pipeline_], pipeline_parameters=pipeine_parameters, custom_hyperparameters=custom_hyperparameters) automl.search() + print(automl.full_rankings) + for pipeline in automl.full_rankings.parameters: + pp(pipeline) -def test_pipelines_true_true_false(): - pass + + +def test_pipelines_true_true_false(X_y_binary): + X, y = X_y_binary + + component_graph = ['Imputer', 'Random Forest Classifier'] + parameters = { + "Imputer": {'numeric_impute_strategy': 'most_frequent'}, + "Random Forest Classifier": {'n_estimators': 200, + "max_depth": 11} + } + pipeine_parameters = { + "Random Forest Classifier": {'n_estimators': 222} + } + + pipeline_ = BinaryClassificationPipeline(component_graph=component_graph, parameters=parameters) + + automl = AutoMLSearch(X, y, problem_type="binary", pipeline_parameters=pipeine_parameters, + max_batches=3, allowed_pipelines=[pipeline_]) + automl.search() + + print(automl.full_rankings) + for pipeline in automl.full_rankings.parameters: + pp(pipeline) def test_pipelines_true_false_true(): From 4937680a44fc9526b3899f97484adfe67aa729e2 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 27 May 2021 16:30:03 -0400 Subject: [PATCH 21/85] print update --- evalml/tests/automl_tests/test_automl.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 2152138667..3d1e27bfc8 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -2428,6 +2428,12 @@ def test_pipelines_true_true_true(X_y_binary): def test_pipelines_true_true_false(X_y_binary): + ''' + The parameters passed in pipeline_parameters are set for the first iteration but the remaining default + to the parameter in the estimator, here from 222 to 100. 
+ The problem is that + Solution + ''' X, y = X_y_binary component_graph = ['Imputer', 'Random Forest Classifier'] From 4b28e592536037d46d281e0d95bc0decc02a6cb1 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 27 May 2021 16:30:13 -0400 Subject: [PATCH 22/85] print update --- evalml/automl/automl_algorithm/iterative_algorithm.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 0bb72f99eb..c8968d71e6 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -189,7 +189,7 @@ def _transform_parameters(self, pipeline, proposed_parameters): init_params = inspect.signature(component_class.__init__).parameters print(f"iterativealgorithm - _transform_parameters - init_params: {init_params}") # For first batch, pass the pipeline params to the components that need them - print(f"iterativealgorithm - _transform_parameters - component_parameters - 0: {component_parameters}") + print(f"iterativealgorithm - _transform_parameters - component_parameters: {component_parameters}") if name in self._custom_hyperparameters and self._batch_number == 0: print(f"iterativealgorithm - _transform_parameters - hyperparameter name: {name}") for param_name, value in self._custom_hyperparameters[name].items(): @@ -201,7 +201,6 @@ def _transform_parameters(self, pipeline, proposed_parameters): component_parameters[param_name] = value.rvs(random_state=self.random_seed) else: component_parameters[param_name] = value - print(f"iterativealgorithm - _transform_parameters - component_parameters - 1: {component_parameters}") if name in pipeline.parameters and self._batch_number == 0: for param_name, value in pipeline.parameters[name].items(): print(f"iterativealgorithm - _transform_parameters - pipeline.parameters name/value: {param_name} - {value}") @@ -211,7 +210,6 @@ def _transform_parameters(self, pipeline, proposed_parameters): else: component_parameters[param_name] = value - print(f"iterativealgorithm - _transform_parameters - component_parameters - 2: {component_parameters}") if name in self._pipeline_params and self._batch_number == 0: for param_name, value in self._pipeline_params[name].items(): print(f"iterativealgorithm - _transform_parameters - self._pipeline_params name/value: {param_name} - {value}") @@ -220,7 +218,6 @@ def _transform_parameters(self, pipeline, proposed_parameters): "to custom_hyperparameters instead!") else: component_parameters[param_name] = value - print(f"iterativealgorithm - _transform_parameters - component_parameters - 3: {component_parameters}") # Inspects each component and adds the following parameters when needed if 'n_jobs' in init_params: component_parameters['n_jobs'] = self.n_jobs @@ -234,7 +231,7 @@ def _transform_parameters(self, pipeline, proposed_parameters): print(f"iterativealgorithm - _transform_parameters - self._pipeline_params['pipeline'] name/value: {param_name} - {value}") if param_name in init_params: component_parameters[param_name] = value - print(f"iterativealgorithm - _transform_parameters - component_parameters - 4: {component_parameters}") + print(f"iterativealgorithm - _transform_parameters - component_parameters: {component_parameters}") parameters[name] = component_parameters print(f"iterativealgorithm - _transform_parameters - parameters: {parameters}") return parameters From 98b487097874a298470144e218309094c8abdd95 Mon Sep 17 00:00:00 2001 
From: Parthiv Naresh
Date: Thu, 27 May 2021 16:41:51 -0400
Subject: [PATCH 23/85] pass frozen parameters

---
 evalml/automl/automl_search.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py
index 90dd10b2c9..0a50cc3e96 100644
--- a/evalml/automl/automl_search.py
+++ b/evalml/automl/automl_search.py
@@ -363,8 +363,8 @@ def __init__(self,
         self._frozen_pipeline_parameters = {}

         parameters = copy.copy(self.pipeline_parameters)
-        print(f"automl_search - init - parameters: {parameters}")
         custom_hyperparameters = copy.copy(self.custom_hyperparameters)
+
         if self.problem_configuration:
             parameters.update({'pipeline': self.problem_configuration})
             self._frozen_pipeline_parameters.update({'pipeline': self.problem_configuration})
@@ -372,12 +372,12 @@ def __init__(self,
         self.sampler_method = sampler_method
         self.sampler_balanced_ratio = sampler_balanced_ratio
         self._sampler_name = None
+
         if is_classification(self.problem_type):
             self._sampler_name = self.sampler_method
             if self.sampler_method in ['auto', 'Oversampler']:
                 self._sampler_name = get_best_sampler_for_data(self.X_train, self.y_train, self.sampler_method, self.sampler_balanced_ratio)
             self._frozen_pipeline_parameters[self._sampler_name] = {"sampling_ratio": self.sampler_balanced_ratio}
-            parameters[self._sampler_name] = {"sampling_ratio": self.sampler_balanced_ratio}

         if self.allowed_pipelines is None:
             logger.info("Generating pipelines to search over...")
@@ -387,8 +387,7 @@ def __init__(self,
             index_columns = list(self.X_train.select('index').columns)
             if len(index_columns) > 0 and drop_columns is None:
                 self._frozen_pipeline_parameters['Drop Columns Transformer'] = {'columns': index_columns}
-                parameters['Drop Columns Transformer'] = {'columns': index_columns}
-            self.allowed_pipelines = [make_pipeline(self.X_train, self.y_train, estimator, self.problem_type, parameters=parameters, sampler_name=self._sampler_name) for estimator in allowed_estimators]
+            self.allowed_pipelines = [make_pipeline(self.X_train, self.y_train, estimator, self.problem_type, parameters=self._frozen_pipeline_parameters, sampler_name=self._sampler_name) for estimator in allowed_estimators]

From 1698b9b66e970a9fb208d9caf339ac1d3f268883 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Thu, 27 May 2021 18:34:42 -0400
Subject: [PATCH 24/85] update tests

---
 .../automl_algorithm/iterative_algorithm.py   |  9 -------
 evalml/tests/automl_tests/test_automl.py      | 27 +++++++++++++++++--
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py
index c8968d71e6..44f716398c 100644
--- a/evalml/automl/automl_algorithm/iterative_algorithm.py
+++ b/evalml/automl/automl_algorithm/iterative_algorithm.py
@@ -200,15 +200,6 @@ def _transform_parameters(self, pipeline, proposed_parameters):
                 component_parameters[param_name] = value.rvs(random_state=self.random_seed)
             else:
-                component_parameters[param_name] = value
-            if name in pipeline.parameters and self._batch_number == 0:
-                for param_name, value in pipeline.parameters[name].items():
-                    print(f"iterativealgorithm - _transform_parameters - pipeline.parameters name/value: {param_name} - {value}")
-                    if isinstance(value, (Integer, Real, Categorical)):
-                        raise ValueError("Pipeline parameters should not contain skopt.Space variables, please 
pass them " - "to custom_hyperparameters instead!") - else: - component_parameters[param_name] = value if name in self._pipeline_params and self._batch_number == 0: for param_name, value in self._pipeline_params[name].items(): diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 3d1e27bfc8..2190163cad 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -2457,8 +2457,31 @@ def test_pipelines_true_true_false(X_y_binary): pp(pipeline) -def test_pipelines_true_false_true(): - pass +def test_pipelines_true_false_true(X_y_binary): + ''' + + ''' + X, y = X_y_binary + + component_graph = ['Imputer', 'Random Forest Classifier'] + parameters = { + "Imputer": {'numeric_impute_strategy': 'most_frequent'}, + "Random Forest Classifier": {'n_estimators': 200, + "max_depth": 11} + } + custom_hyperparameters = { + "Random Forest Classifier": {"max_depth": Integer(11, 12)} + } + + pipeline_ = BinaryClassificationPipeline(component_graph=component_graph, parameters=parameters) + + automl = AutoMLSearch(X, y, problem_type="binary", custom_hyperparameters=custom_hyperparameters, + max_batches=3, allowed_pipelines=[pipeline_]) + automl.search() + + print(automl.full_rankings) + for pipeline in automl.full_rankings.parameters: + pp(pipeline) def test_pipelines_true_false_false(): From 8f7be8c227aad787f89e042d8d0e25b66062503d Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 27 May 2021 19:56:25 -0400 Subject: [PATCH 25/85] test update --- evalml/tests/automl_tests/test_automl.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 2190163cad..3a135078d8 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -2458,9 +2458,6 @@ def test_pipelines_true_true_false(X_y_binary): def test_pipelines_true_false_true(X_y_binary): - ''' - - ''' X, y = X_y_binary component_graph = ['Imputer', 'Random Forest Classifier'] @@ -2484,8 +2481,25 @@ def test_pipelines_true_false_true(X_y_binary): pp(pipeline) -def test_pipelines_true_false_false(): - pass +def test_pipelines_true_false_false(X_y_binary): + X, y = X_y_binary + + component_graph = ['Imputer', 'Random Forest Classifier'] + parameters = { + "Imputer": {'numeric_impute_strategy': 'most_frequent'}, + "Random Forest Classifier": {'n_estimators': 200, + "max_depth": 11} + } + + pipeline_ = BinaryClassificationPipeline(component_graph=component_graph, parameters=parameters) + + automl = AutoMLSearch(X, y, problem_type="binary", + max_batches=3, allowed_pipelines=[pipeline_]) + automl.search() + + print(automl.full_rankings) + for pipeline in automl.full_rankings.parameters: + pp(pipeline) def test_pipelines_false_true_true(): From da01d26b6b1cd6618c85717aac1c57630c5d8be2 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 27 May 2021 20:05:45 -0400 Subject: [PATCH 26/85] test update --- evalml/tests/automl_tests/test_automl.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 3a135078d8..02d29022d9 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -2502,8 +2502,27 @@ def test_pipelines_true_false_false(X_y_binary): pp(pipeline) -def test_pipelines_false_true_true(): - pass +def 
test_pipelines_false_true_true(X_y_binary): + X, y = X_y_binary + + component_graph = ['Imputer', 'Random Forest Classifier'] + custom_hyperparameters = { + "Random Forest Classifier": {"max_depth": Integer(11, 12)} + } + pipeine_parameters = { + "Random Forest Classifier": {'n_estimators': 222} + } + + pipeline_ = BinaryClassificationPipeline(component_graph=component_graph) + + automl = AutoMLSearch(X, y, problem_type="binary", + max_batches=3, allowed_pipelines=[pipeline_], pipeline_parameters=pipeine_parameters, + custom_hyperparameters=custom_hyperparameters) + automl.search() + + print(automl.full_rankings) + for pipeline in automl.full_rankings.parameters: + pp(pipeline) def test_pipelines_false_true_false(): From 8c84504c7e24cab33a8f6b8f138033ef5f10abcd Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 28 May 2021 19:05:44 -0400 Subject: [PATCH 27/85] test update --- evalml/tests/automl_tests/test_automl.py | 58 ++++++++++++++++++++---- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 02d29022d9..db7ed07da3 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -2509,14 +2509,14 @@ def test_pipelines_false_true_true(X_y_binary): custom_hyperparameters = { "Random Forest Classifier": {"max_depth": Integer(11, 12)} } - pipeine_parameters = { + pipeline_parameters = { "Random Forest Classifier": {'n_estimators': 222} } pipeline_ = BinaryClassificationPipeline(component_graph=component_graph) automl = AutoMLSearch(X, y, problem_type="binary", - max_batches=3, allowed_pipelines=[pipeline_], pipeline_parameters=pipeine_parameters, + max_batches=3, allowed_pipelines=[pipeline_], pipeline_parameters=pipeline_parameters, custom_hyperparameters=custom_hyperparameters) automl.search() @@ -2525,19 +2525,61 @@ def test_pipelines_false_true_true(X_y_binary): pp(pipeline) -def test_pipelines_false_true_false(): - pass +@patch('evalml.pipelines.BinaryClassificationPipeline.score') +@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +def test_pipelines_false_true_false(mock_fit, mock_score, X_y_binary): + X, y = X_y_binary + component_graph = ['Imputer', 'Random Forest Classifier'] + pipeline_parameters = { + "Random Forest Classifier": {'n_estimators': 222} + } + + pipeline_ = BinaryClassificationPipeline(component_graph=component_graph) + + automl = AutoMLSearch(X, y, problem_type="binary", + max_batches=3, allowed_pipelines=[pipeline_], pipeline_parameters=pipeline_parameters) + automl.search() -def test_pipelines_false_false_true(): - pass + print(automl.full_rankings) + for pipeline in automl.full_rankings.parameters: + pp(pipeline) -def test_pipelines_false_false_false(): - pass +def test_pipelines_false_false_true(X_y_binary): + X, y = X_y_binary + component_graph = ['Imputer', 'Random Forest Classifier'] + custom_hyperparameters = { + "Random Forest Classifier": {"max_depth": Integer(11, 12)} + } + pipeline_ = BinaryClassificationPipeline(component_graph=component_graph) + automl = AutoMLSearch(X, y, problem_type="binary", + max_batches=3, allowed_pipelines=[pipeline_], + custom_hyperparameters=custom_hyperparameters) + automl.search() + + print(automl.full_rankings) + for pipeline in automl.full_rankings.parameters: + pp(pipeline) + + +def test_pipelines_false_false_false(X_y_binary): + X, y = X_y_binary + + component_graph = ['Imputer', 'Random Forest Classifier'] + + pipeline_ = 
BinaryClassificationPipeline(component_graph=component_graph) + + automl = AutoMLSearch(X, y, problem_type="binary", + max_batches=3, allowed_pipelines=[pipeline_]) + automl.search() + + print(automl.full_rankings) + for pipeline in automl.full_rankings.parameters: + pp(pipeline) From 21cc485b36f76b99ea151966edcc04c987482aaf Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 28 May 2021 19:24:06 -0400 Subject: [PATCH 28/85] print statements --- evalml/pipelines/component_graph.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py index 97a148addf..2fbdeb92f1 100644 --- a/evalml/pipelines/component_graph.py +++ b/evalml/pipelines/component_graph.py @@ -97,12 +97,16 @@ def instantiate(self, parameters): raise ValueError(f"Cannot reinstantiate a component graph that was previously instantiated") parameters = parameters or {} + print(f'component_graph - instantiate - parameters: {parameters}') self._is_instantiated = True component_instances = {} for component_name, component_class in self.component_instances.items(): + print(f'component_graph - instantiate - component_name/component_class: {component_name} - {component_class}') component_parameters = parameters.get(component_name, {}) + print(f'component_graph - instantiate - component_parameters: {component_parameters}') try: new_component = component_class(**component_parameters, random_seed=self.random_seed) + print(f'component_graph - instantiate - new_component: {new_component}') except (ValueError, TypeError) as e: self._is_instantiated = False err = "Error received when instantiating component {} with the following arguments {}".format(component_name, component_parameters) From 4adaef731ed4839a08d489e0ef23ff9eca793e4a Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Sat, 29 May 2021 13:42:02 -0400 Subject: [PATCH 29/85] test update --- evalml/tests/automl_tests/test_automl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index db7ed07da3..b4e8c1a041 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -1761,7 +1761,7 @@ def test_iterative_algorithm_pipeline_hyperparameters_make_pipeline_other_errors X, y = X_y_multi custom_hyperparameters = { "Imputer": { - "numeric_impute_strategy": ["most_frequent", "mean"] + "numeric_impute_strategy": Categorical(["most_frequent", "mean"]) } } estimators = get_estimators('multiclass', [ModelFamily.EXTRA_TREES]) From b7aae184820a02baa475398f2008eb959110d3b2 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Sun, 30 May 2021 10:20:10 -0400 Subject: [PATCH 30/85] test updates --- .../automl_algorithm/iterative_algorithm.py | 1 + evalml/tests/automl_tests/test_automl.py | 45 ++++--------------- 2 files changed, 10 insertions(+), 36 deletions(-) diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 44f716398c..967add9b0a 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -215,6 +215,7 @@ def _transform_parameters(self, pipeline, proposed_parameters): if 'number_features' in init_params: component_parameters['number_features'] = self.number_features if name in self._pipeline_params and name == 'Drop Columns Transformer' and self._batch_number > 0: + print(f"iterativealgorithm - _transform_parameters - Drop Columns 
Transformer: {self._pipeline_params[name]}") component_parameters['columns'] = self._pipeline_params[name]['columns'] if 'pipeline' in self._pipeline_params: print(f"iterativealgorithm - _transform_parameters - self._pipeline_params end: {self._pipeline_params}") diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index b4e8c1a041..bc03b049fe 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -1777,36 +1777,6 @@ def test_iterative_algorithm_pipeline_hyperparameters_make_pipeline_other_errors assert "Default parameters for components" not in str(error.value) -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_iterative_algorithm_pipeline_hyperparameters_make_pipeline_errors(mock_fit, mock_score, X_y_multi): - X, y = X_y_multi - invalid_custom_hyperparameters = { - "Imputer": { - "numeric_impute_strategy": ["most_frequent", "median"] - } - } - larger_invalid = { - "Imputer": { - "numeric_impute_strategy": ["most_frequent", "mean"] - }, - "Extra Trees Classifier": { - "max_depth": [4, 5, 6, 7], - "max_features": ["sqrt", "log2"] - } - } - estimators = get_estimators('multiclass', [ModelFamily.EXTRA_TREES]) - - invalid_pipelines = [make_pipeline(X, y, estimator, 'multiclass', None) for estimator in estimators] - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=invalid_pipelines, custom_hyperparameters=invalid_custom_hyperparameters) - with pytest.raises(ValueError, match="Default parameters for components"): - automl.search() - - invalid_pipelines = [make_pipeline(X, y, estimator, 'multiclass', None) for estimator in estimators] - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=invalid_pipelines, custom_hyperparameters=larger_invalid) - with pytest.raises(ValueError, match="Default parameters for components"): - automl.search() - ''' TURN THIS INTO A MASSIVE GRID TEST TURN THIS INTO A MASSIVE GRID TEST @@ -2226,8 +2196,10 @@ def test_automl_pipeline_params_simple(mock_fit, mock_score, X_y_binary): mock_score.return_value = {'Log Loss Binary': 1.0} X, y = X_y_binary params = {"Imputer": {"numeric_impute_strategy": "most_frequent"}, - "Logistic Regression Classifier": {"C": 20, "penalty": 'none'}, - "Elastic Net Classifier": {"alpha": 0.75, "l1_ratio": 0.2}} + "Logistic Regression Classifier": {"C": 20, + "penalty": 'none'}, + "Elastic Net Classifier": {"alpha": 0.75, + "l1_ratio": 0.2}} automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", pipeline_parameters=params, n_jobs=1) automl.search() for i, row in automl.rankings.iterrows(): @@ -2439,7 +2411,7 @@ def test_pipelines_true_true_false(X_y_binary): component_graph = ['Imputer', 'Random Forest Classifier'] parameters = { "Imputer": {'numeric_impute_strategy': 'most_frequent'}, - "Random Forest Classifier": {'n_estimators': 200, + "Random Forest Classifier": {'n_estimators': 222, "max_depth": 11} } pipeine_parameters = { @@ -2448,7 +2420,7 @@ def test_pipelines_true_true_false(X_y_binary): pipeline_ = BinaryClassificationPipeline(component_graph=component_graph, parameters=parameters) - automl = AutoMLSearch(X, y, problem_type="binary", pipeline_parameters=pipeine_parameters, + automl = AutoMLSearch(X, y, problem_type="binary", max_batches=3, allowed_pipelines=[pipeline_]) automl.search() @@ -2930,9 +2902,10 @@ def test_automl_drop_index_columns(mock_train, 
mock_binary_score, X_y_binary): automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_batches=2) automl.search() for pipeline in automl.allowed_pipelines: + print(pipeline.parameters) assert pipeline.get_component('Drop Columns Transformer') - assert 'Drop Columns Transformer' in pipeline.hyperparameters - assert pipeline.hyperparameters['Drop Columns Transformer'] == {} + assert 'Drop Columns Transformer' in pipeline.parameters + assert pipeline.parameters['Drop Columns Transformer'] == {'columns': ['index_col']} all_drop_column_params = [] for _, row in automl.full_rankings.iterrows(): From 7ff180ce47739ae4cad9cbdbe9490e63226d59e3 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Sun, 30 May 2021 11:20:19 -0400 Subject: [PATCH 31/85] test updates --- evalml/tests/automl_tests/test_automl.py | 215 ++--------------------- 1 file changed, 11 insertions(+), 204 deletions(-) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index bc03b049fe..1d50a1e6b5 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -2196,8 +2196,8 @@ def test_automl_pipeline_params_simple(mock_fit, mock_score, X_y_binary): mock_score.return_value = {'Log Loss Binary': 1.0} X, y = X_y_binary params = {"Imputer": {"numeric_impute_strategy": "most_frequent"}, - "Logistic Regression Classifier": {"C": 20, - "penalty": 'none'}, + "Logistic Regression Classifier": {"C": 10, + "penalty": 'l2'}, "Elastic Net Classifier": {"alpha": 0.75, "l1_ratio": 0.2}} automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", pipeline_parameters=params, n_jobs=1) @@ -2206,22 +2206,22 @@ def test_automl_pipeline_params_simple(mock_fit, mock_score, X_y_binary): if 'Imputer' in row['parameters']: assert row['parameters']['Imputer']['numeric_impute_strategy'] == 'most_frequent' if 'Logistic Regression Classifier' in row['parameters']: - assert row['parameters']['Logistic Regression Classifier']['C'] == 20 - assert row['parameters']['Logistic Regression Classifier']['penalty'] == 'none' + assert row['parameters']['Logistic Regression Classifier']['C'] == 10 + assert row['parameters']['Logistic Regression Classifier']['penalty'] == 'l2' if 'Elastic Net Classifier' in row['parameters']: assert row['parameters']['Elastic Net Classifier']['alpha'] == 0.75 assert row['parameters']['Elastic Net Classifier']['l1_ratio'] == 0.2 -'''@patch('evalml.pipelines.RegressionPipeline.fit') +@patch('evalml.pipelines.RegressionPipeline.fit') @patch('evalml.pipelines.RegressionPipeline.score') def test_automl_pipeline_params_multiple(mock_score, mock_fit, X_y_regression): mock_score.return_value = {'R2': 1.0} X, y = X_y_regression - params = {'Imputer': {'numeric_impute_strategy': Categorical(['median', 'most_frequent'])}, + hyperparams = {'Imputer': {'numeric_impute_strategy': Categorical(['median', 'most_frequent'])}, 'Decision Tree Regressor': {'max_depth': Categorical([17, 18, 19]), 'max_features': Categorical(['auto'])}, 'Elastic Net Regressor': {"alpha": Real(0, 0.5), "l1_ratio": Categorical((0.01, 0.02, 0.03))}} - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', pipeline_parameters=params, n_jobs=1) + automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', custom_hyperparameters=hyperparams, n_jobs=1) automl.search() for i, row in automl.rankings.iterrows(): if 'Imputer' in row['parameters']: @@ -2231,7 +2231,7 @@ def test_automl_pipeline_params_multiple(mock_score, mock_fit, X_y_regression): assert 
row['parameters']['Decision Tree Regressor']['max_features'] == 'auto' if 'Elastic Net Regressor' in row['parameters']: assert 0 < row['parameters']['Elastic Net Regressor']['alpha'] < 0.5 - assert row['parameters']['Elastic Net Regressor']['l1_ratio'] == Categorical((0.01, 0.02, 0.03)).rvs(random_state=automl.random_seed)''' + assert row['parameters']['Elastic Net Regressor']['l1_ratio'] == Categorical((0.01, 0.02, 0.03)).rvs(random_state=automl.random_seed) @patch('evalml.pipelines.BinaryClassificationPipeline.fit') @@ -2329,7 +2329,7 @@ def test_automl_adds_pipeline_parameters_to_custom_pipeline_hyperparams(mock_sco automl = AutoMLSearch(X, y, problem_type="binary", allowed_pipelines=[pipeline_one, pipeline_two, pipeline_three], pipeline_parameters={"Imputer": {"numeric_impute_strategy": "most_frequent"}}, custom_hyperparameters={"One Hot Encoder": {"top_n": Categorical([12, 10])}, - "Imputer": {"numeric_impute_strategy": Categorical(["median"])}}, + "Imputer": {"numeric_impute_strategy": Categorical(["median", "most_frequent"])}}, max_batches=4) automl.search() from pprint import pp @@ -2340,9 +2340,8 @@ def test_automl_adds_pipeline_parameters_to_custom_pipeline_hyperparams(mock_sco for i, row in automl.full_rankings.iterrows(): if "Mode Baseline Binary" in row['pipeline_name']: continue - assert row["parameters"]["Imputer"]["numeric_impute_strategy"] == "most_frequent" - assert any(row['parameters']["One Hot Encoder"]["top_n"] == 12 for _, row in automl.full_rankings.iterrows() if row["pipeline_name"] == "Pipe Line Two") - assert any(row['parameters']["One Hot Encoder"]["top_n"] == 5 for _, row in automl.full_rankings.iterrows() if row["pipeline_name"] == "Pipe Line One") + assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in ["most_frequent", "median"] + assert 10 <= row['parameters']["One Hot Encoder"]["top_n"] <= 12 @patch('evalml.pipelines.MulticlassClassificationPipeline.score') @@ -2363,198 +2362,6 @@ def test_automl_pipeline_params_kwargs(mock_fit, mock_score, X_y_multi): assert row['parameters']['Decision Tree Classifier']['max_depth'] == 1 -#@patch('evalml.pipelines.BinaryClassificationPipeline.score') -#@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_pipelines_true_true_true(X_y_binary): - ''' - The numeric_impute_strategy: most_frequent parameter of Imputer won't be found in the hyperparameter ranges. - The tuner will look for most_frequent but only find mean. - Solution is to ignore conflicting hyperparameters - ''' - X, y = X_y_binary - - component_graph = ['Imputer', 'Random Forest Classifier'] - parameters = { - "Imputer": {'numeric_impute_strategy': 'most_frequent'}, - "Random Forest Classifier": {'n_estimators': 200, - "max_depth": 11} - } - custom_hyperparameters = { - "Random Forest Classifier": {"max_depth": Integer(11, 12)} - } - pipeine_parameters = { - "Random Forest Classifier": {'n_estimators': 222} - } - - pipeline_ = BinaryClassificationPipeline(component_graph=component_graph, parameters=parameters) - - automl = AutoMLSearch(X, y, problem_type="binary", - max_batches=3, allowed_pipelines=[pipeline_], pipeline_parameters=pipeine_parameters, - custom_hyperparameters=custom_hyperparameters) - automl.search() - - print(automl.full_rankings) - for pipeline in automl.full_rankings.parameters: - pp(pipeline) - - - -def test_pipelines_true_true_false(X_y_binary): - ''' - The parameters passed in pipeline_parameters are set for the first iteration but the remaining default - to the parameter in the estimator, here from 222 to 100. 
- The problem is that - Solution - ''' - X, y = X_y_binary - - component_graph = ['Imputer', 'Random Forest Classifier'] - parameters = { - "Imputer": {'numeric_impute_strategy': 'most_frequent'}, - "Random Forest Classifier": {'n_estimators': 222, - "max_depth": 11} - } - pipeine_parameters = { - "Random Forest Classifier": {'n_estimators': 222} - } - - pipeline_ = BinaryClassificationPipeline(component_graph=component_graph, parameters=parameters) - - automl = AutoMLSearch(X, y, problem_type="binary", - max_batches=3, allowed_pipelines=[pipeline_]) - automl.search() - - print(automl.full_rankings) - for pipeline in automl.full_rankings.parameters: - pp(pipeline) - - -def test_pipelines_true_false_true(X_y_binary): - X, y = X_y_binary - - component_graph = ['Imputer', 'Random Forest Classifier'] - parameters = { - "Imputer": {'numeric_impute_strategy': 'most_frequent'}, - "Random Forest Classifier": {'n_estimators': 200, - "max_depth": 11} - } - custom_hyperparameters = { - "Random Forest Classifier": {"max_depth": Integer(11, 12)} - } - - pipeline_ = BinaryClassificationPipeline(component_graph=component_graph, parameters=parameters) - - automl = AutoMLSearch(X, y, problem_type="binary", custom_hyperparameters=custom_hyperparameters, - max_batches=3, allowed_pipelines=[pipeline_]) - automl.search() - - print(automl.full_rankings) - for pipeline in automl.full_rankings.parameters: - pp(pipeline) - - -def test_pipelines_true_false_false(X_y_binary): - X, y = X_y_binary - - component_graph = ['Imputer', 'Random Forest Classifier'] - parameters = { - "Imputer": {'numeric_impute_strategy': 'most_frequent'}, - "Random Forest Classifier": {'n_estimators': 200, - "max_depth": 11} - } - - pipeline_ = BinaryClassificationPipeline(component_graph=component_graph, parameters=parameters) - - automl = AutoMLSearch(X, y, problem_type="binary", - max_batches=3, allowed_pipelines=[pipeline_]) - automl.search() - - print(automl.full_rankings) - for pipeline in automl.full_rankings.parameters: - pp(pipeline) - - -def test_pipelines_false_true_true(X_y_binary): - X, y = X_y_binary - - component_graph = ['Imputer', 'Random Forest Classifier'] - custom_hyperparameters = { - "Random Forest Classifier": {"max_depth": Integer(11, 12)} - } - pipeline_parameters = { - "Random Forest Classifier": {'n_estimators': 222} - } - - pipeline_ = BinaryClassificationPipeline(component_graph=component_graph) - - automl = AutoMLSearch(X, y, problem_type="binary", - max_batches=3, allowed_pipelines=[pipeline_], pipeline_parameters=pipeline_parameters, - custom_hyperparameters=custom_hyperparameters) - automl.search() - - print(automl.full_rankings) - for pipeline in automl.full_rankings.parameters: - pp(pipeline) - - -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_pipelines_false_true_false(mock_fit, mock_score, X_y_binary): - X, y = X_y_binary - - component_graph = ['Imputer', 'Random Forest Classifier'] - pipeline_parameters = { - "Random Forest Classifier": {'n_estimators': 222} - } - - pipeline_ = BinaryClassificationPipeline(component_graph=component_graph) - - automl = AutoMLSearch(X, y, problem_type="binary", - max_batches=3, allowed_pipelines=[pipeline_], pipeline_parameters=pipeline_parameters) - automl.search() - - print(automl.full_rankings) - for pipeline in automl.full_rankings.parameters: - pp(pipeline) - - -def test_pipelines_false_false_true(X_y_binary): - X, y = X_y_binary - - component_graph = ['Imputer', 'Random Forest 
Classifier'] - custom_hyperparameters = { - "Random Forest Classifier": {"max_depth": Integer(11, 12)} - } - - pipeline_ = BinaryClassificationPipeline(component_graph=component_graph) - - automl = AutoMLSearch(X, y, problem_type="binary", - max_batches=3, allowed_pipelines=[pipeline_], - custom_hyperparameters=custom_hyperparameters) - automl.search() - - print(automl.full_rankings) - for pipeline in automl.full_rankings.parameters: - pp(pipeline) - - -def test_pipelines_false_false_false(X_y_binary): - X, y = X_y_binary - - component_graph = ['Imputer', 'Random Forest Classifier'] - - pipeline_ = BinaryClassificationPipeline(component_graph=component_graph) - - automl = AutoMLSearch(X, y, problem_type="binary", - max_batches=3, allowed_pipelines=[pipeline_]) - automl.search() - - print(automl.full_rankings) - for pipeline in automl.full_rankings.parameters: - pp(pipeline) - - - @pytest.mark.parametrize("random_seed", [0, 1, 9]) @patch('evalml.pipelines.MulticlassClassificationPipeline.score') @patch('evalml.pipelines.MulticlassClassificationPipeline.fit') From 5839642de6b728c6f7b96ce30755dcb7972fc723 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Sun, 30 May 2021 17:06:58 -0400 Subject: [PATCH 32/85] test update --- evalml/tests/automl_tests/test_automl.py | 38 +++++++++++++----------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 1d50a1e6b5..bd928c6df4 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -1799,36 +1799,38 @@ def test_iterative_algorithm_pipeline_hyperparameters_make_pipeline_other_errors TURN THIS INTO A MASSIVE GRID TEST ''' -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +@patch('evalml.pipelines.MulticlassClassificationPipeline.score', return_value={"Log Loss Multiclass": 0.6}) +@patch('evalml.pipelines.MulticlassClassificationPipeline.fit') def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline(mock_fit, mock_score, X_y_multi): X, y = X_y_multi + custom_hyperparameters = { "Imputer": { - "numeric_impute_strategy": ["mean"] - } - } - larger_custom = { - "Imputer": { - "numeric_impute_strategy": ["most_frequent", "mean"] + "numeric_impute_strategy": Categorical(["mean"]) }, "Extra Trees Classifier": { - "max_depth": [4, 5, 6, 7], - "max_features": ["auto", "log2"] + "max_depth": Integer(4, 7), + "max_features": Categorical(["auto", "log2"]) } } + estimators = get_estimators('multiclass', [ModelFamily.EXTRA_TREES]) - pipelines = [make_pipeline(X, y, estimator, 'multiclass', None) for estimator in estimators] + pipelines = [make_pipeline(X, y, estimator, 'multiclass') for estimator in estimators] - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=pipelines, custom_hyperparameters=custom_hyperparameters) + automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=pipelines, + custom_hyperparameters=custom_hyperparameters, max_batches=4) automl.search() - assert automl.best_pipeline.parameters['Imputer']['numeric_impute_strategy'] == "mean" - invalid_pipelines = [make_pipeline(X, y, estimator, 'multiclass', None) for estimator in estimators] - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', max_batches=2, allowed_pipelines=invalid_pipelines, custom_hyperparameters=larger_custom) - automl.search() - for params in 
automl.full_rankings['parameters'].values[:-1]: - assert params['Imputer']['numeric_impute_strategy'] in larger_custom['Imputer']['numeric_impute_strategy'] + pp(automl.full_rankings) + for pipe in automl.full_rankings.parameters: + pp(pipe) + + for i, row in automl.full_rankings.iterrows(): + if "Mode Baseline Multiclass" in row['pipeline_name']: + continue + assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in custom_hyperparameters['Imputer']['numeric_impute_strategy'] + assert 4 <= row["parameters"]["Extra Trees Classifier"]["max_depth"] <= 7 + assert row["parameters"]["Extra Trees Classifier"]["max_features"] in custom_hyperparameters["Extra Trees Classifier"]["max_features"] @patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.6}) From 7134c3395a803b31674ff9e8eb422b98af8442e6 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 31 May 2021 09:03:40 -0400 Subject: [PATCH 33/85] test updates --- .../automl_algorithm/automl_algorithm.py | 9 +- evalml/tests/automl_tests/test_automl.py | 90 ++++++++++--------- .../automl_tests/test_iterative_algorithm.py | 46 +++++++--- 3 files changed, 87 insertions(+), 58 deletions(-) diff --git a/evalml/automl/automl_algorithm/automl_algorithm.py b/evalml/automl/automl_algorithm/automl_algorithm.py index c17fdfe584..85202c42f0 100644 --- a/evalml/automl/automl_algorithm/automl_algorithm.py +++ b/evalml/automl/automl_algorithm/automl_algorithm.py @@ -39,10 +39,11 @@ def __init__(self, print(f"AutoMLAlgorithm - init - pipeline.parameters: {pipeline.parameters}") pipeline_hyperparameters = get_hyperparameter_ranges(pipeline.component_graph, custom_hyperparameters) print(f"AutoMLAlgorithm - init - pipeline.pipeline_hyperparameters: {pipeline_hyperparameters}") - for comp_name in custom_hyperparameters.keys(): - if comp_name in pipeline.parameters.keys(): - print(f"AutoMLAlgorithm - init - hyperparameter in pipeline: {comp_name} - {custom_hyperparameters[comp_name]}") - pipeline_hyperparameters[comp_name].update(custom_hyperparameters[comp_name]) + if custom_hyperparameters: + for comp_name in custom_hyperparameters.keys(): + if comp_name in pipeline.parameters.keys(): + print(f"AutoMLAlgorithm - init - hyperparameter in pipeline: {comp_name} - {custom_hyperparameters[comp_name]}") + pipeline_hyperparameters[comp_name].update(custom_hyperparameters[comp_name]) print(f"AutoMLAlgorithm - init - pipeline_hyperparameters: {pipeline_hyperparameters}") self._tuners[pipeline.name] = self._tuner_class(pipeline_hyperparameters, random_seed=self.random_seed) self._pipeline_number = 0 diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index bd928c6df4..5a13a7fa5d 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -1777,60 +1777,62 @@ def test_iterative_algorithm_pipeline_hyperparameters_make_pipeline_other_errors assert "Default parameters for components" not in str(error.value) -''' -TURN THIS INTO A MASSIVE GRID TEST -TURN THIS INTO A MASSIVE GRID TEST -TURN THIS INTO A MASSIVE GRID TEST -TURN THIS INTO A MASSIVE GRID TEST -TURN THIS INTO A MASSIVE GRID TEST -TURN THIS INTO A MASSIVE GRID TEST -TURN THIS INTO A MASSIVE GRID TEST -TURN THIS INTO A MASSIVE GRID TEST -TURN THIS INTO A MASSIVE GRID TEST -TURN THIS INTO A MASSIVE GRID TEST -TURN THIS INTO A MASSIVE GRID TEST -TURN THIS INTO A MASSIVE GRID TEST -TURN THIS INTO A MASSIVE GRID TEST -TURN THIS INTO A MASSIVE GRID TEST -TURN THIS INTO A MASSIVE GRID 
TEST -TURN THIS INTO A MASSIVE GRID TEST -TURN THIS INTO A MASSIVE GRID TEST -TURN THIS INTO A MASSIVE GRID TEST -TURN THIS INTO A MASSIVE GRID TEST - -''' +@pytest.mark.parametrize("pipelines,pipeline_parameters", [(True, False), (True, True), (False, False)]) +@pytest.mark.parametrize("automl_parameters", [True, False]) +@pytest.mark.parametrize("custom_hyperparameters", [True, False]) @patch('evalml.pipelines.MulticlassClassificationPipeline.score', return_value={"Log Loss Multiclass": 0.6}) @patch('evalml.pipelines.MulticlassClassificationPipeline.fit') -def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline(mock_fit, mock_score, X_y_multi): +def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline(mock_fit, mock_score, custom_hyperparameters, + automl_parameters, pipelines, pipeline_parameters, + X_y_multi): X, y = X_y_multi - custom_hyperparameters = { - "Imputer": { - "numeric_impute_strategy": Categorical(["mean"]) - }, - "Extra Trees Classifier": { - "max_depth": Integer(4, 7), - "max_features": Categorical(["auto", "log2"]) + pipeline_parameters_ = None + pipeline_ = None + automl_parameters_ = None + custom_hyperparameters_ = None + + if pipeline_parameters: + pipeline_parameters_ = { + "Imputer": {'numeric_impute_strategy': 'most_frequent'}, + "Random Forest Classifier": {'n_estimators': 200, + "max_depth": 11} } - } - estimators = get_estimators('multiclass', [ModelFamily.EXTRA_TREES]) - pipelines = [make_pipeline(X, y, estimator, 'multiclass') for estimator in estimators] + if pipelines: + component_graph_ = ['Imputer', 'Random Forest Classifier'] + pipeline_ = [MulticlassClassificationPipeline(component_graph=component_graph_, parameters=pipeline_parameters_)] - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=pipelines, - custom_hyperparameters=custom_hyperparameters, max_batches=4) - automl.search() + if automl_parameters: + automl_parameters_ = { + "Random Forest Classifier": {'n_estimators': 201} + } + if custom_hyperparameters: + custom_hyperparameters_ = { + "Imputer": { + "numeric_impute_strategy": Categorical(["mean"]) + }, + "Random Forest Classifier": { + "max_depth": Integer(4, 7), + 'n_estimators': Integer(190, 210) + } + } - pp(automl.full_rankings) - for pipe in automl.full_rankings.parameters: - pp(pipe) + automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=pipeline_, + pipeline_parameters=automl_parameters_, custom_hyperparameters=custom_hyperparameters_, max_batches=4) + automl.search() for i, row in automl.full_rankings.iterrows(): - if "Mode Baseline Multiclass" in row['pipeline_name']: - continue - assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in custom_hyperparameters['Imputer']['numeric_impute_strategy'] - assert 4 <= row["parameters"]["Extra Trees Classifier"]["max_depth"] <= 7 - assert row["parameters"]["Extra Trees Classifier"]["max_features"] in custom_hyperparameters["Extra Trees Classifier"]["max_features"] + if "Random Forest Classifier" in row['pipeline_name']: + if custom_hyperparameters_: + assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in custom_hyperparameters_['Imputer']['numeric_impute_strategy'] + assert 4 <= row["parameters"]["Random Forest Classifier"]["max_depth"] <= 7 + assert 190 <= row["parameters"]["Random Forest Classifier"]["n_estimators"] <= 210 + else: + assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in ["mean", "median", "most_frequent"] + assert 1 <= 
row["parameters"]["Random Forest Classifier"]["max_depth"] <= 10 + assert 10 <= row["parameters"]["Random Forest Classifier"]["n_estimators"] <= 1000 + @patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.6}) diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py index e7835b23ae..1b930f521c 100644 --- a/evalml/tests/automl_tests/test_iterative_algorithm.py +++ b/evalml/tests/automl_tests/test_iterative_algorithm.py @@ -273,7 +273,7 @@ def test_iterative_algorithm_stacked_ensemble_n_jobs_regression(n_jobs, linear_r assert seen_ensemble -@pytest.mark.parametrize("parameters", [1, "hello", 1.3, -1.0006, Categorical([1, 3, 4]), Categorical((2, 3, 4))]) +@pytest.mark.parametrize("parameters", [1, "hello", 1.3, -1.0006, Categorical([1, 3, 4]), Integer(2, 4)]) def test_iterative_algorithm_pipeline_params(parameters, dummy_binary_pipeline_classes): dummy_binary_pipeline_classes = dummy_binary_pipeline_classes(parameters) algo = IterativeAlgorithm(allowed_pipelines=dummy_binary_pipeline_classes, @@ -281,12 +281,41 @@ def test_iterative_algorithm_pipeline_params(parameters, dummy_binary_pipeline_c pipeline_params={'pipeline': {"gap": 2, "max_delay": 10}, 'Mock Classifier': {'dummy_parameter': parameters}}) - next_batch = algo.next_batch() parameter = parameters - if isinstance(parameter, Categorical): - parameter = parameter.rvs(random_state=0) - assert all([p.parameters['pipeline'] == {"gap": 2, "max_delay": 10} for p in next_batch]) - assert all([p.parameters['Mock Classifier'] == {"dummy_parameter": parameter, "n_jobs": -1} for p in next_batch]) + if isinstance(parameter, (Categorical, Integer)): + with pytest.raises(ValueError, match="Pipeline parameters should not contain skopt.Space variables"): + algo.next_batch() + else: + next_batch = algo.next_batch() + assert all([p.parameters['pipeline'] == {"gap": 2, "max_delay": 10} for p in next_batch]) + assert all([p.parameters['Mock Classifier'] == {"dummy_parameter": parameter, "n_jobs": -1} for p in next_batch]) + + scores = np.arange(0, len(next_batch)) + for score, pipeline in zip(scores, next_batch): + algo.add_result(score, pipeline, {"id": algo.pipeline_number}) + + # make sure that future batches remain in the hyperparam range + for i in range(1, 5): + next_batch = algo.next_batch() + for p in next_batch: + if isinstance(parameters, Categorical): + assert p.parameters['Mock Classifier']['dummy_parameter'] in parameters + else: + assert p.parameters['Mock Classifier']['dummy_parameter'] == parameter + + +@pytest.mark.parametrize("parameters,hyperparameters", [(1, Categorical([1, 3, 4])), (3, Integer(2, 4))]) +def test_iterative_algorithm_custom_hyperparameters(parameters, hyperparameters, dummy_binary_pipeline_classes): + dummy_binary_pipeline_classes = dummy_binary_pipeline_classes(parameters) + algo = IterativeAlgorithm(allowed_pipelines=dummy_binary_pipeline_classes, + random_seed=0, + pipeline_params={'Mock Classifier': {'dummy_parameter': parameters}}, + custom_hyperparameters={'Mock Classifier': {'dummy_parameter': hyperparameters}}) + + next_batch = algo.next_batch() + + assert all([p.parameters['Mock Classifier']["n_jobs"] == -1 for p in next_batch]) + assert all([p.parameters['Mock Classifier']["dummy_parameter"] == parameters for p in next_batch]) scores = np.arange(0, len(next_batch)) for score, pipeline in zip(scores, next_batch): @@ -296,10 +325,7 @@ def test_iterative_algorithm_pipeline_params(parameters, 
dummy_binary_pipeline_c for i in range(1, 5): next_batch = algo.next_batch() for p in next_batch: - if isinstance(parameters, Categorical): - assert p.parameters['Mock Classifier']['dummy_parameter'] in parameters - else: - assert p.parameters['Mock Classifier']['dummy_parameter'] == parameter + assert p.parameters['Mock Classifier']['dummy_parameter'] in hyperparameters def test_iterative_algorithm_frozen_parameters(): From f52668fe6710db00f27a9a17eeaece0d8226b450 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 31 May 2021 14:42:25 -0400 Subject: [PATCH 34/85] test changes --- .../automl_algorithm/automl_algorithm.py | 8 +- .../automl_algorithm/iterative_algorithm.py | 28 --- evalml/automl/automl_search.py | 19 -- evalml/automl/engine/engine_base.py | 4 - evalml/automl/utils.py | 19 +- evalml/pipelines/component_graph.py | 4 - evalml/tests/automl_tests/test_automl.py | 6 +- .../tests/automl_tests/test_automl_utils.py | 23 ++ .../automl_tests/test_iterative_algorithm.py | 15 +- evalml/tests/conftest.py | 4 +- .../test_permutation_importance.py | 10 +- .../test_binary_classification.py | 6 - .../test_multiclass_classification.py | 6 - .../test_regression.py | 7 - .../pipeline_tests/test_pipeline_utils.py | 56 +---- evalml/tests/pipeline_tests/test_pipelines.py | 204 ++---------------- evalml/tuners/skopt_tuner.py | 2 - evalml/tuners/tuner.py | 7 - 18 files changed, 63 insertions(+), 365 deletions(-) diff --git a/evalml/automl/automl_algorithm/automl_algorithm.py b/evalml/automl/automl_algorithm/automl_algorithm.py index 85202c42f0..b2e3394fa3 100644 --- a/evalml/automl/automl_algorithm/automl_algorithm.py +++ b/evalml/automl/automl_algorithm/automl_algorithm.py @@ -25,6 +25,7 @@ def __init__(self, Arguments: allowed_pipelines (list(class)): A list of PipelineBase subclasses indicating the pipelines allowed in the search. The default of None indicates all pipelines for this problem type are allowed. + custom_hyperparameters (dict): Custom hyperparameter ranges specified for pipelines to iterate over. max_iterations (int): The maximum number of iterations to be evaluated. tuner_class (class): A subclass of Tuner, to be used to find parameters for each pipeline. The default of None indicates the SKOptTuner will be used. random_seed (int): Seed for the random number generator. Defaults to 0. 
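For readers following this patch series: the behavior these changes enforce is that pipeline_parameters carries fixed component values applied to every evaluated pipeline, while custom_hyperparameters carries skopt search spaces that the tuner samples from after the first batch; passing a Categorical, Integer, or Real through pipeline_parameters now raises a ValueError. A minimal usage sketch, assuming the AutoMLSearch signature introduced in these patches (the dataset below is a synthetic placeholder, not taken from the tests):

    import numpy as np
    import pandas as pd
    from skopt.space import Categorical, Integer

    from evalml.automl import AutoMLSearch

    # Synthetic placeholder data; any binary classification dataset works here.
    X = pd.DataFrame(np.random.rand(100, 5), columns=[f"feat_{i}" for i in range(5)])
    y = pd.Series(np.random.randint(0, 2, 100))

    # Fixed values: applied as-is to every pipeline evaluated during search.
    pipeline_parameters = {"Random Forest Classifier": {"n_estimators": 201}}

    # Search spaces: the tuner samples from these ranges after the first batch.
    # Per these patches, putting skopt Space objects in pipeline_parameters
    # instead of here raises a ValueError.
    custom_hyperparameters = {
        "Imputer": {"numeric_impute_strategy": Categorical(["mean", "most_frequent"])},
        "Random Forest Classifier": {"max_depth": Integer(4, 7)},
    }

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary",
                          pipeline_parameters=pipeline_parameters,
                          custom_hyperparameters=custom_hyperparameters,
                          max_batches=4)
    automl.search()

This mirrors the parametrized grid test added above, which checks that sampled parameters stay inside the custom ranges when they are supplied and fall back to each component's default ranges otherwise.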
@@ -35,16 +36,11 @@ def __init__(self, self._tuner_class = tuner_class or SKOptTuner self._tuners = {} for pipeline in self.allowed_pipelines: - print(f"AutoMLAlgorithm - init - pipeline: {pipeline}") - print(f"AutoMLAlgorithm - init - pipeline.parameters: {pipeline.parameters}") pipeline_hyperparameters = get_hyperparameter_ranges(pipeline.component_graph, custom_hyperparameters) - print(f"AutoMLAlgorithm - init - pipeline.pipeline_hyperparameters: {pipeline_hyperparameters}") if custom_hyperparameters: for comp_name in custom_hyperparameters.keys(): if comp_name in pipeline.parameters.keys(): - print(f"AutoMLAlgorithm - init - hyperparameter in pipeline: {comp_name} - {custom_hyperparameters[comp_name]}") pipeline_hyperparameters[comp_name].update(custom_hyperparameters[comp_name]) - print(f"AutoMLAlgorithm - init - pipeline_hyperparameters: {pipeline_hyperparameters}") self._tuners[pipeline.name] = self._tuner_class(pipeline_hyperparameters, random_seed=self.random_seed) self._pipeline_number = 0 self._batch_number = 0 @@ -67,8 +63,6 @@ def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): """ if pipeline.name not in self._tuners: raise PipelineNotFoundError(f"No such pipeline allowed in this AutoML search: {pipeline.name}") - print(f"automlalgorithm - add_result - pipeline: {pipeline}") - print(f"automlalgorithm - add_result - pipeline parameters: {pipeline.parameters}") self._tuners[pipeline.name].add(pipeline.parameters, score_to_minimize) @property diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 967add9b0a..bcfc20ad13 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -95,8 +95,6 @@ def next_batch(self): next_batch = [] if self._batch_number == 0: - for pipeline in self.allowed_pipelines: - print(f"iterative_algorithm - next_batch - new pipeline parameters for batch 0: {self._combine_parameters(pipeline, {})}") next_batch = [pipeline.new(parameters=self._combine_parameters(pipeline, {}), random_seed=self.random_seed) for pipeline in self.allowed_pipelines] @@ -120,11 +118,9 @@ def next_batch(self): num_pipelines = (len(self._first_batch_results) + 1) if self.ensembling else len(self._first_batch_results) idx = (self._batch_number - 1) % num_pipelines pipeline = self._first_batch_results[idx][1] - print(f"iterative_algorithm - next_batch - pipeline: {pipeline.parameters}") for i in range(self.pipelines_per_batch): proposed_parameters = self._tuners[pipeline.name].propose() parameters = self._combine_parameters(pipeline, proposed_parameters) - print(f"iterative_algorithm - next_batch - new pipeline parameters: {parameters}") next_batch.append(pipeline.new(parameters=parameters, random_seed=self.random_seed)) self._pipeline_number += len(next_batch) self._batch_number += 1 @@ -132,11 +128,7 @@ def next_batch(self): def _combine_parameters(self, pipeline, proposed_parameters): """Helper function for logic to transform proposed parameters and frozen parameters.""" - print(f'Iterative Algorithm - _combine_parameters - pipeline: {pipeline}') - print(f'Iterative Algorithm - _combine_parameters - proposed_parameters: {proposed_parameters}') - print(f'Iterative Algorithm - _combine_parameters - self._frozen_pipeline_parameters: {self._frozen_pipeline_parameters}') _returning = {**self._transform_parameters(pipeline, proposed_parameters), **self._frozen_pipeline_parameters} - print(f'Iterative Algorithm - 
_combine_parameters - returning combined parameters: {_returning}') return _returning def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): @@ -150,8 +142,6 @@ def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): if pipeline.model_family != ModelFamily.ENSEMBLE: if self.batch_number == 1: try: - print(f'iterative algorithm - add_result - pipeline: {pipeline}') - print(f'iterative algorithm - add_result - trained_pipeline_results: {trained_pipeline_results}') super().add_result(score_to_minimize, pipeline, trained_pipeline_results) except ValueError as e: if 'is not within the bounds of the space' in str(e): @@ -172,28 +162,16 @@ def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): def _transform_parameters(self, pipeline, proposed_parameters): """Given a pipeline parameters dict, make sure n_jobs and number_features are set.""" - print(f"****************************** Batch: {self._batch_number} ******************************") - print(f"iterativealgorithm - _transform_parameters - pipeline: {pipeline}") - print(f"iterativealgorithm - _transform_parameters - pipeline parameters: {pipeline.parameters}") - print(f"iterativealgorithm - _transform_parameters - proposed_parameters: {proposed_parameters}") - print(f"iterativealgorithm - _transform_parameters - self._pipeline_params: {self._pipeline_params}") parameters = {} if 'pipeline' in self._pipeline_params: parameters['pipeline'] = self._pipeline_params['pipeline'] - print(f"iterativealgorithm - _transform_parameters - parameters['pipeline']: {parameters['pipeline']}") for name, component_class in pipeline.linearized_component_graph: - print('-------------------------------------------------------------------------') - print(f"iterativealgorithm - _transform_parameters - component name: {name}") component_parameters = proposed_parameters.get(name, {}) init_params = inspect.signature(component_class.__init__).parameters - print(f"iterativealgorithm - _transform_parameters - init_params: {init_params}") # For first batch, pass the pipeline params to the components that need them - print(f"iterativealgorithm - _transform_parameters - component_parameters: {component_parameters}") if name in self._custom_hyperparameters and self._batch_number == 0: - print(f"iterativealgorithm - _transform_parameters - hyperparameter name: {name}") for param_name, value in self._custom_hyperparameters[name].items(): - print(f"iterativealgorithm - _transform_parameters - hyperparameter_name name/value: {param_name} - {value}") if isinstance(value, (Integer, Real)): # get a random value in the space component_parameters[param_name] = value.rvs(random_state=self.random_seed)[0] @@ -203,7 +181,6 @@ def _transform_parameters(self, pipeline, proposed_parameters): component_parameters[param_name] = value if name in self._pipeline_params and self._batch_number == 0: for param_name, value in self._pipeline_params[name].items(): - print(f"iterativealgorithm - _transform_parameters - self._pipeline_params name/value: {param_name} - {value}") if isinstance(value, (Integer, Real, Categorical)): raise ValueError("Pipeline parameters should not contain skopt.Space variables, please pass them " "to custom_hyperparameters instead!") @@ -215,15 +192,10 @@ def _transform_parameters(self, pipeline, proposed_parameters): if 'number_features' in init_params: component_parameters['number_features'] = self.number_features if name in self._pipeline_params and name == 'Drop Columns Transformer' and 
self._batch_number > 0: - print(f"iterativealgorithm - _transform_parameters - Drop Columns Transformer: {self._pipeline_params[name]}") component_parameters['columns'] = self._pipeline_params[name]['columns'] if 'pipeline' in self._pipeline_params: - print(f"iterativealgorithm - _transform_parameters - self._pipeline_params end: {self._pipeline_params}") for param_name, value in self._pipeline_params['pipeline'].items(): - print(f"iterativealgorithm - _transform_parameters - self._pipeline_params['pipeline'] name/value: {param_name} - {value}") if param_name in init_params: component_parameters[param_name] = value - print(f"iterativealgorithm - _transform_parameters - component_parameters: {component_parameters}") parameters[name] = component_parameters - print(f"iterativealgorithm - _transform_parameters - parameters: {parameters}") return parameters diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 0a50cc3e96..b7a311b309 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -392,11 +392,6 @@ def __init__(self, if self.allowed_pipelines == []: raise ValueError("No allowed pipelines to search") - from pprint import pprint as pp - for pipe_ in self.allowed_pipelines: - pp(f"automl_search - init - pipelines: {pipe_}") - pp(f"automl_search - init - pipelines parameters: {pipe_.parameters}") - logger.info(f"{len(self.allowed_pipelines)} pipelines ready for search.") check_all_pipeline_names_unique(self.allowed_pipelines) @@ -445,10 +440,6 @@ def __init__(self, logger.debug(f"allowed_pipelines set to {[pipeline.name for pipeline in self.allowed_pipelines]}") logger.debug(f"allowed_model_families set to {self.allowed_model_families}") - for pipe_ in self.allowed_pipelines: - print(f"automlsearch - init - pre iterative algorithm pipeline parameters: {pipe_.parameters}") - print(f"automlsearch - init - parameters: {parameters}") - self._automl_algorithm = IterativeAlgorithm( max_iterations=self.max_iterations, allowed_pipelines=self.allowed_pipelines, @@ -612,9 +603,7 @@ def search(self, show_iteration_plot=True): log_title(logger, f"Evaluating Batch Number {self._get_batch_number()}") for pipeline in current_batch_pipelines: self._pre_evaluation_callback(pipeline) - print(f"automlsearch - search - pipeline pre submit evaluation: {pipeline.parameters}") computation = self._engine.submit_evaluation_job(self.automl_config, pipeline, self.X_train, self.y_train) - print(f"automlsearch - search - computation: {computation}") computations.append(computation) current_computation_index = 0 while self._should_continue() and len(computations) > 0: @@ -666,7 +655,6 @@ def _find_best_pipeline(self): if self._train_best_pipeline: X_train = self.X_train y_train = self.y_train - print(f"automlsearch - _add_baseline_pipelines - best_pipeline: {best_pipeline}") best_pipeline = self._engine.submit_training_job(self.automl_config, best_pipeline, X_train, y_train).get_result() self._best_pipeline = best_pipeline @@ -762,7 +750,6 @@ def _add_baseline_pipelines(self): baseline = self._get_baseline_pipeline() self._pre_evaluation_callback(baseline) logger.info(f"Evaluating Baseline Pipeline: {baseline.name}") - print(f"automlsearch - _add_baseline_pipelines - baseline: {baseline}") computation = self._engine.submit_evaluation_job(self.automl_config, baseline, self.X_train, self.y_train) evaluation = computation.get_result() data, pipeline, job_log = evaluation.get('scores'), evaluation.get("pipeline"), evaluation.get("logger") @@ -832,7 +819,6 @@ def 
_post_evaluation_callback(self, pipeline, evaluation_results, job_log): if not is_baseline: score_to_minimize = -cv_score if self.objective.greater_is_better else cv_score try: - print(f"automlsearch - _post_evaluation_callback - pipeline: {pipeline}") self._automl_algorithm.add_result(score_to_minimize, pipeline, self._results['pipeline_results'][pipeline_id]) except PipelineNotFoundError: pass @@ -935,15 +921,12 @@ def add_to_rankings(self, pipeline): pipeline (PipelineBase): pipeline to train and evaluate. """ pipeline_rows = self.full_rankings[self.full_rankings['pipeline_name'] == pipeline.name] - print(f"automlsearch - add_to_rankings - pipeline_rows: {pipeline_rows}") for parameter in pipeline_rows['parameters']: if pipeline.parameters == parameter: return - print(f"automlsearch - add_to_rankings - pipeline: {pipeline}") computation = self._engine.submit_evaluation_job(self.automl_config, pipeline, self.X_train, self.y_train) evaluation = computation.get_result() data, pipeline, job_log = evaluation.get('scores'), evaluation.get("pipeline"), evaluation.get("logger") - print(f"automlsearch - add_to_rankings - pipeline: {pipeline}") self._post_evaluation_callback(pipeline, data, job_log) self._find_best_pipeline() @@ -1037,7 +1020,6 @@ def train_pipelines(self, pipelines): y_train = self.y_train for pipeline in pipelines: - print(f"automlsearch - train_pipelines - pipeline: {pipeline}") computations.append(self._engine.submit_training_job(self.automl_config, pipeline, X_train, y_train)) while computations: @@ -1076,7 +1058,6 @@ def score_pipelines(self, pipelines, X_holdout, y_holdout, objectives): computations = [] for pipeline in pipelines: - print(f"automlsearch - score_pipelines - pipeline: {pipeline}") computations.append(self._engine.submit_scoring_job(self.automl_config, pipeline, X_holdout, y_holdout, objectives)) while computations: diff --git a/evalml/automl/engine/engine_base.py b/evalml/automl/engine/engine_base.py index a65d0716f4..5f9de711ef 100644 --- a/evalml/automl/engine/engine_base.py +++ b/evalml/automl/engine/engine_base.py @@ -104,8 +104,6 @@ def train_pipeline(pipeline, X, y, optimize_thresholds, objective): Returns: pipeline (PipelineBase): trained pipeline. """ - print(f"engine_base - train_pipeline - pipeline: {pipeline}") - print(f"engine_base - train_pipeline - pipeline parameters: {pipeline.parameters}") X_threshold_tuning = None y_threshold_tuning = None if optimize_thresholds and pipeline.can_tune_threshold_with_objective(objective): @@ -131,8 +129,6 @@ def train_and_score_pipeline(pipeline, automl_config, full_X_train, full_y_train tuple of three items: First - A dict containing cv_score_mean, cv_scores, training_time and a cv_data structure with details. Second - The pipeline class we trained and scored. Third - the job logger instance with all the recorded messages. 
""" - print(f"engine_base - train_and_score_pipeline - pipeline: {pipeline}") - print(f"engine_base - train_and_score_pipeline - pipeline parameters: {pipeline.parameters}") start = time.time() cv_data = [] logger.info("\tStarting cross validation") diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index ab6318842f..14fb9730ad 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -155,17 +155,22 @@ def get_best_sampler_for_data(X, y, sampler_method, sampler_balanced_ratio): return 'Undersampler' -def get_hyperparameter_ranges(linearized_component_graph, custom_hyperparameters): - """Returns hyperparameter ranges from all components as a dictionary""" - linearized_component_graph = ComponentGraph.linearized_component_graph(linearized_component_graph) +def get_hyperparameter_ranges(component_graph, custom_hyperparameters): + """ + Returns hyperparameter ranges from all components as a dictionary. + + Arguments: + component_graph (list): The component_graph of the pipeline. + custom_hyperparameters (dict): The custom hyperparameters to be passed to the pipeline. + + Returns: + dict: Dictionary of hyperparameter ranges for each component in the component graph. + """ + linearized_component_graph = ComponentGraph.linearized_component_graph(component_graph) hyperparameter_ranges = dict() - print(f"utils - get_hyperparameter_ranges - linearized_component_graph: {linearized_component_graph}") - print(f"utils - get_hyperparameter_ranges - custom_hyperparameters: {custom_hyperparameters}") for component_name, component_class in linearized_component_graph: component_hyperparameters = copy.copy(component_class.hyperparameter_ranges) - print(f"utils - get_hyperparameter_ranges - component_hyperparameters: {component_name} - {component_hyperparameters}") if custom_hyperparameters and component_name in custom_hyperparameters: component_hyperparameters.update(custom_hyperparameters.get(component_name, {})) hyperparameter_ranges[component_name] = component_hyperparameters - print(f"utils - get_hyperparameter_ranges - hyperparameter_ranges: {hyperparameter_ranges}") return hyperparameter_ranges diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py index 2fbdeb92f1..97a148addf 100644 --- a/evalml/pipelines/component_graph.py +++ b/evalml/pipelines/component_graph.py @@ -97,16 +97,12 @@ def instantiate(self, parameters): raise ValueError(f"Cannot reinstantiate a component graph that was previously instantiated") parameters = parameters or {} - print(f'component_graph - instantiate - parameters: {parameters}') self._is_instantiated = True component_instances = {} for component_name, component_class in self.component_instances.items(): - print(f'component_graph - instantiate - component_name/component_class: {component_name} - {component_class}') component_parameters = parameters.get(component_name, {}) - print(f'component_graph - instantiate - component_parameters: {component_parameters}') try: new_component = component_class(**component_parameters, random_seed=self.random_seed) - print(f'component_graph - instantiate - new_component: {new_component}') except (ValueError, TypeError) as e: self._is_instantiated = False err = "Error received when instantiating component {} with the following arguments {}".format(component_name, component_parameters) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 5a13a7fa5d..5618287b83 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ 
b/evalml/tests/automl_tests/test_automl.py @@ -2336,11 +2336,7 @@ def test_automl_adds_pipeline_parameters_to_custom_pipeline_hyperparams(mock_sco "Imputer": {"numeric_impute_strategy": Categorical(["median", "most_frequent"])}}, max_batches=4) automl.search() - from pprint import pp - print(automl.full_rankings) - for pipe in automl.full_rankings.parameters: - pp(pipe) - expected_top_n = {"Pipe Line One": {5, 10}, "Pipe Line Two": {12, 10}, "Pipe Line Three": {10}} + for i, row in automl.full_rankings.iterrows(): if "Mode Baseline Binary" in row['pipeline_name']: continue diff --git a/evalml/tests/automl_tests/test_automl_utils.py b/evalml/tests/automl_tests/test_automl_utils.py index 3464c40f5f..b42834a385 100644 --- a/evalml/tests/automl_tests/test_automl_utils.py +++ b/evalml/tests/automl_tests/test_automl_utils.py @@ -3,16 +3,20 @@ import pandas as pd import pytest from sklearn.model_selection import KFold, StratifiedKFold +from skopt.space import Categorical, Integer +from evalml.automl.automl_algorithm import IterativeAlgorithm from evalml.automl.utils import ( _LARGE_DATA_PERCENT_VALIDATION, _LARGE_DATA_ROW_THRESHOLD, get_best_sampler_for_data, get_default_primary_search_objective, + get_hyperparameter_ranges, make_data_splitter, tune_binary_threshold ) from evalml.objectives import F1, R2, LogLossBinary, LogLossMulticlass +from evalml.pipelines import BinaryClassificationPipeline from evalml.preprocessing.data_splitters import ( TimeSeriesSplit, TrainingValidationSplit @@ -207,3 +211,22 @@ def test_get_best_sampler_for_data_sampler_method(categorical_columns, sampler_m assert name_output == 'SMOTENC Oversampler' else: assert name_output == 'SMOTEN Oversampler' + + +def test_get_hyperparameter_ranges(): + pipeline_ = BinaryClassificationPipeline(component_graph=["Imputer", "Random Forest Classifier"]) + custom_hyperparameters_ = { + "Imputer": { + "numeric_impute_strategy": Categorical(["most_frequent", "mean"]) + }, + "Random Forest Classifier": { + "n_estimators": Integer(150, 160) + } + } + algo = IterativeAlgorithm(allowed_pipelines=[pipeline_], + random_seed=0, + custom_hyperparameters=custom_hyperparameters_) + algo_ranges = algo._tuners['Random Forest Classifier w/ Imputer']._pipeline_hyperparameter_ranges + hyper_ranges = get_hyperparameter_ranges(pipeline_.component_graph, custom_hyperparameters_) + + assert algo_ranges == hyper_ranges diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py index 1b930f521c..114012053a 100644 --- a/evalml/tests/automl_tests/test_iterative_algorithm.py +++ b/evalml/tests/automl_tests/test_iterative_algorithm.py @@ -273,7 +273,7 @@ def test_iterative_algorithm_stacked_ensemble_n_jobs_regression(n_jobs, linear_r assert seen_ensemble -@pytest.mark.parametrize("parameters", [1, "hello", 1.3, -1.0006, Categorical([1, 3, 4]), Integer(2, 4)]) +@pytest.mark.parametrize("parameters", [1, "hello", 1.3, -1.0006, Categorical([1, 3, 4]), Integer(2, 4), Real(2, 6)]) def test_iterative_algorithm_pipeline_params(parameters, dummy_binary_pipeline_classes): dummy_binary_pipeline_classes = dummy_binary_pipeline_classes(parameters) algo = IterativeAlgorithm(allowed_pipelines=dummy_binary_pipeline_classes, @@ -282,7 +282,7 @@ def test_iterative_algorithm_pipeline_params(parameters, dummy_binary_pipeline_c 'Mock Classifier': {'dummy_parameter': parameters}}) parameter = parameters - if isinstance(parameter, (Categorical, Integer)): + if isinstance(parameter, (Categorical, Integer, Real)): with 
pytest.raises(ValueError, match="Pipeline parameters should not contain skopt.Space variables"): algo.next_batch() else: @@ -294,14 +294,10 @@ def test_iterative_algorithm_pipeline_params(parameters, dummy_binary_pipeline_c for score, pipeline in zip(scores, next_batch): algo.add_result(score, pipeline, {"id": algo.pipeline_number}) - # make sure that future batches remain in the hyperparam range + # make sure that future batches have the same parameter value for i in range(1, 5): next_batch = algo.next_batch() - for p in next_batch: - if isinstance(parameters, Categorical): - assert p.parameters['Mock Classifier']['dummy_parameter'] in parameters - else: - assert p.parameters['Mock Classifier']['dummy_parameter'] == parameter + assert all([p.parameters['Mock Classifier']['dummy_parameter'] == parameter for p in next_batch]) @pytest.mark.parametrize("parameters,hyperparameters", [(1, Categorical([1, 3, 4])), (3, Integer(2, 4))]) @@ -324,8 +320,7 @@ def test_iterative_algorithm_custom_hyperparameters(parameters, hyperparameters, # make sure that future batches remain in the hyperparam range for i in range(1, 5): next_batch = algo.next_batch() - for p in next_batch: - assert p.parameters['Mock Classifier']['dummy_parameter'] in hyperparameters + assert all([p.parameters['Mock Classifier']['dummy_parameter'] in hyperparameters for p in next_batch]) def test_iterative_algorithm_frozen_parameters(): diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 552f68531f..7617f17025 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -44,7 +44,7 @@ class MockBinaryPipelineWithOnlyEstimator(BinaryClassificationPipeline): component_graph = [estimator] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) return MockBinaryPipelineWithOnlyEstimator elif problem_type == ProblemTypes.MULTICLASS: class MockMulticlassPipelineWithOnlyEstimator(MulticlassClassificationPipeline): @@ -52,7 +52,7 @@ class MockMulticlassPipelineWithOnlyEstimator(MulticlassClassificationPipeline): component_graph = [estimator] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) return MockMulticlassPipelineWithOnlyEstimator elif problem_type == ProblemTypes.REGRESSION: diff --git a/evalml/tests/model_understanding_tests/test_permutation_importance.py b/evalml/tests/model_understanding_tests/test_permutation_importance.py index 38a4ecb278..4c149be493 100644 --- a/evalml/tests/model_understanding_tests/test_permutation_importance.py +++ b/evalml/tests/model_understanding_tests/test_permutation_importance.py @@ -175,7 +175,7 @@ class PipelineWithDimReduction(BinaryClassificationPipeline): component_graph = [PCA, 'Logistic Regression Classifier'] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) class EnsembleDag(BinaryClassificationPipeline): @@ -192,28 +192,28 @@ class 
EnsembleDag(BinaryClassificationPipeline): } def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) class PipelineWithDFS(BinaryClassificationPipeline): component_graph = [DFSTransformer, 'Logistic Regression Classifier'] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) class PipelineWithCustomComponent(BinaryClassificationPipeline): component_graph = [DoubleColumns, 'Logistic Regression Classifier'] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) class StackedEnsemblePipeline(BinaryClassificationPipeline): component_graph = ['Stacked Ensemble Classifier'] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) pipelines_that_do_not_support_fast_permutation_importance = [PipelineWithDimReduction, diff --git a/evalml/tests/pipeline_tests/classification_pipeline_tests/test_binary_classification.py b/evalml/tests/pipeline_tests/classification_pipeline_tests/test_binary_classification.py index befc24d891..f4af3c0a57 100644 --- a/evalml/tests/pipeline_tests/classification_pipeline_tests/test_binary_classification.py +++ b/evalml/tests/pipeline_tests/classification_pipeline_tests/test_binary_classification.py @@ -33,12 +33,8 @@ def test_binary_init(): 'n_jobs': -1 } } - assert clf.custom_hyperparameters is None assert clf.name == "Random Forest Classifier w/ Imputer + One Hot Encoder" assert clf.random_seed == 0 - custom_hyperparameters = {"Imputer": {"numeric_impute_strategy": Categorical(["most_frequent", 'mean'])}, - "Imputer_1": {"numeric_impute_strategy": Categorical(["median", 'mean'])}, - "Random Forest Classifier": {"n_estimators": Categorical([50, 100])}} parameters = { "One Hot Encoder": { "top_n": 20 @@ -46,7 +42,6 @@ def test_binary_init(): } clf = BinaryClassificationPipeline(component_graph=["Imputer", "One Hot Encoder", "Random Forest Classifier"], parameters=parameters, - custom_hyperparameters=custom_hyperparameters, custom_name="Custom Pipeline", random_seed=42) @@ -71,7 +66,6 @@ def test_binary_init(): 'n_jobs': -1 } } - assert clf.custom_hyperparameters == custom_hyperparameters assert clf.name == "Custom Pipeline" assert clf.random_seed == 42 diff --git a/evalml/tests/pipeline_tests/classification_pipeline_tests/test_multiclass_classification.py b/evalml/tests/pipeline_tests/classification_pipeline_tests/test_multiclass_classification.py index 7568ad64d5..0580abf78a 100644 --- a/evalml/tests/pipeline_tests/classification_pipeline_tests/test_multiclass_classification.py +++ b/evalml/tests/pipeline_tests/classification_pipeline_tests/test_multiclass_classification.py @@ -27,12 +27,8 @@ def test_multiclass_init(): 'n_jobs': -1 } } - assert clf.custom_hyperparameters is None assert clf.name == "Random Forest Classifier w/ Imputer + One Hot Encoder" assert clf.random_seed == 0 - 
custom_hyperparameters = {"Imputer": {"numeric_impute_strategy": Categorical(["most_frequent", 'mean'])}, - "Imputer_1": {"numeric_impute_strategy": Categorical(["median", 'mean'])}, - "Random Forest Classifier": {"n_estimators": Categorical([50, 100])}} parameters = { "One Hot Encoder": { "top_n": 20 @@ -40,7 +36,6 @@ def test_multiclass_init(): } clf = MulticlassClassificationPipeline(component_graph=["Imputer", "One Hot Encoder", "Random Forest Classifier"], parameters=parameters, - custom_hyperparameters=custom_hyperparameters, custom_name="Custom Pipeline", random_seed=42) @@ -65,6 +60,5 @@ def test_multiclass_init(): 'n_jobs': -1 } } - assert clf.custom_hyperparameters == custom_hyperparameters assert clf.name == "Custom Pipeline" assert clf.random_seed == 42 diff --git a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py index 83bb296fe1..2ebee0c280 100644 --- a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py +++ b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py @@ -1,6 +1,5 @@ import pandas as pd import pytest -from skopt.space import Categorical from evalml.demos import load_breast_cancer, load_diabetes, load_wine from evalml.pipelines import RegressionPipeline @@ -30,12 +29,8 @@ def test_regression_init(): 'n_jobs': -1 } } - assert clf.custom_hyperparameters is None assert clf.name == "Random Forest Regressor w/ Imputer + One Hot Encoder" assert clf.random_seed == 0 - custom_hyperparameters = {"Imputer": {"numeric_impute_strategy": Categorical(["most_frequent", 'mean'])}, - "Imputer_1": {"numeric_impute_strategy": Categorical(["median", 'mean'])}, - "Random Forest Regressor": {"n_estimators": Categorical([50, 100])}} parameters = { "One Hot Encoder": { "top_n": 20 @@ -43,7 +38,6 @@ def test_regression_init(): } clf = RegressionPipeline(component_graph=["Imputer", "One Hot Encoder", "Random Forest Regressor"], parameters=parameters, - custom_hyperparameters=custom_hyperparameters, custom_name="Custom Pipeline", random_seed=42) @@ -68,7 +62,6 @@ def test_regression_init(): 'n_jobs': -1 } } - assert clf.custom_hyperparameters == custom_hyperparameters assert clf.name == "Custom Pipeline" assert clf.random_seed == 42 diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index 716e62887b..638aa52d3c 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -38,44 +38,6 @@ from evalml.problem_types import ProblemTypes, is_time_series -def test_make_pipeline_error(): - X = pd.DataFrame([[0, 1], [1, 0]]) - y = pd.Series([1, 0]) - estimators = get_estimators(problem_type="binary") - custom_hyperparameters = [{"Imputer": {"numeric_imput_strategy": ["median"]}}, {"One Hot Encoder": {"value1": ["value2"]}}] - - for estimator in estimators: - with pytest.raises(ValueError, match="if custom_hyperparameters provided, must be dictionary"): - make_pipeline(X, y, estimator, "binary", {}, custom_hyperparameters) - - -@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION, - ProblemTypes.TIME_SERIES_REGRESSION]) -def test_make_pipeline_custom_hyperparameters(problem_type): - X = pd.DataFrame({"all_null": [np.nan, np.nan, np.nan, np.nan, np.nan], - "categorical": ["a", "b", "a", "c", "c"], - "some dates": pd.date_range('2000-02-03', periods=5, freq='W')}) - 
custom_hyperparameters = {'Imputer': { - 'numeric_impute_strategy': ['median'] - }} - - y = pd.Series([0, 0, 1, 0, 0]) - estimators = get_estimators(problem_type=problem_type) - - for estimator_class in estimators: - for problem_type in estimator_class.supported_problem_types: - parameters = {} - if is_time_series(problem_type): - parameters = {"pipeline": {"date_index": "some dates", "gap": 1, "max_delay": 1}, - "Time Series Baseline Estimator": {"date_index": "some dates", "gap": 1, "max_delay": 1}} - - pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters, custom_hyperparameters) - assert pipeline.custom_hyperparameters == custom_hyperparameters - - pipeline2 = make_pipeline(X, y, estimator_class, problem_type, parameters) - assert not pipeline2.custom_hyperparameters - - @pytest.mark.parametrize("input_type", ["pd", "ww"]) @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) def test_make_pipeline_all_nan_no_categoricals(input_type, problem_type): @@ -101,7 +63,6 @@ def test_make_pipeline_all_nan_no_categoricals(input_type, problem_type): pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) - assert pipeline.custom_hyperparameters is None delayed_features = [] if is_time_series(problem_type) and estimator_class.model_family != ModelFamily.ARIMA: delayed_features = [DelayedFeatureTransformer] @@ -137,7 +98,6 @@ def test_make_pipeline(input_type, problem_type): pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) - assert pipeline.custom_hyperparameters is None delayed_features = [] if is_time_series(problem_type): delayed_features = [DelayedFeatureTransformer] @@ -178,7 +138,6 @@ def test_make_pipeline_no_nulls(input_type, problem_type): pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) - assert pipeline.custom_hyperparameters is None delayed_features = [] if is_time_series(problem_type): delayed_features = [DelayedFeatureTransformer] @@ -219,7 +178,6 @@ def test_make_pipeline_no_datetimes(input_type, problem_type): pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) - assert pipeline.custom_hyperparameters is None delayed_features = [] if is_time_series(problem_type): delayed_features = [DelayedFeatureTransformer] @@ -257,7 +215,6 @@ def test_make_pipeline_no_column_names(input_type, problem_type): pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) - assert pipeline.custom_hyperparameters is None delayed_features = [] if is_time_series(problem_type): delayed_features = [DelayedFeatureTransformer] @@ -298,7 +255,6 @@ def test_make_pipeline_text_columns(input_type, problem_type): pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) - assert pipeline.custom_hyperparameters is None delayed_features = [] if is_time_series(problem_type): delayed_features = [DelayedFeatureTransformer] @@ -338,7 +294,6 @@ def test_make_pipeline_only_text_columns(input_type, problem_type): pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) - assert pipeline.custom_hyperparameters is None delayed_features = [] if is_time_series(problem_type): delayed_features = [DelayedFeatureTransformer] @@ -375,7 +330,6 @@ def 
test_make_pipeline_only_datetime_columns(input_type, problem_type): pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) - assert pipeline.custom_hyperparameters is None delayed_features = [] if is_time_series(problem_type): delayed_features = [DelayedFeatureTransformer] @@ -444,7 +398,6 @@ def test_make_pipeline_datetime_no_categorical(input_type, problem_type): pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) - assert pipeline.custom_hyperparameters is None delayed_features = [] if is_time_series(problem_type): delayed_features = [DelayedFeatureTransformer] @@ -649,17 +602,12 @@ def __init__(self, random_arg=False, numpy_arg=[], random_seed=0): def test_generate_code_pipeline(): - custom_hyperparameters = { - "Imputer": { - "numeric_impute_strategy": 'most_frequent' - } - } - binary_pipeline = BinaryClassificationPipeline(['Imputer', 'Random Forest Classifier'], custom_hyperparameters=custom_hyperparameters) + binary_pipeline = BinaryClassificationPipeline(['Imputer', 'Random Forest Classifier']) expected_code = "from evalml.pipelines.binary_classification_pipeline import BinaryClassificationPipeline\n" \ "pipeline = BinaryClassificationPipeline(component_graph=['Imputer', 'Random Forest Classifier'], " \ "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, " \ - "'Random Forest Classifier':{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}, custom_hyperparameters={'Imputer':{'numeric_impute_strategy': 'most_frequent'}}, random_seed=0)" + "'Random Forest Classifier':{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}, random_seed=0)" pipeline = generate_pipeline_code(binary_pipeline) assert expected_code == pipeline diff --git a/evalml/tests/pipeline_tests/test_pipelines.py b/evalml/tests/pipeline_tests/test_pipelines.py index 73d15f8213..e15e49d69e 100644 --- a/evalml/tests/pipeline_tests/test_pipelines.py +++ b/evalml/tests/pipeline_tests/test_pipelines.py @@ -243,7 +243,7 @@ class DummyNonlinearPipeline(BinaryClassificationPipeline): 'Random Forest': ['Random Forest Classifier', 'Logistic Regression', 'Elastic Net']} def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) class DummyTransformerEndPipeline(BinaryClassificationPipeline): component_graph = {'Imputer': ['Imputer'], @@ -253,7 +253,7 @@ class DummyTransformerEndPipeline(BinaryClassificationPipeline): 'Scaler': ['Standard Scaler', 'Random Forest', 'Logistic Regression']} def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) nlbp = DummyNonlinearPipeline({}) nltp = DummyTransformerEndPipeline({}) @@ -373,28 +373,6 @@ def test_name(): assert pipeline_with_neat_name.custom_name == "some_neat_name" -def test_custom_hyperparameters(): - custom_hyperparameters = { - "Imputer": { - "numeric_impute_strategy": ["most_frequent", "median"] - } - } - pipeline = BinaryClassificationPipeline(['Imputer', 'Logistic Regression Classifier'], - custom_hyperparameters=custom_hyperparameters) - assert 
pipeline.custom_hyperparameters == custom_hyperparameters - expected_hyperparameters = { - 'Imputer': { - 'categorical_impute_strategy': ['most_frequent'], - 'numeric_impute_strategy': ['most_frequent', 'median'] - }, - 'Logistic Regression Classifier': { - 'penalty': ['l2'], - 'C': Real(low=0.01, high=10, prior='uniform', transform='identity') - } - } - assert pipeline.hyperparameters == expected_hyperparameters - - def test_multi_format_creation(X_y_binary): X, y = X_y_binary component_graph = component_graph = ['Imputer', 'One Hot Encoder', StandardScaler, 'Logistic Regression Classifier'] @@ -786,155 +764,6 @@ def test_correct_nonlinear_parameters(nonlinear_binary_pipeline_class): assert nlb_pipeline['OneHot_ElasticNet'].parameters['top_n'] == 10 -def test_hyperparameters(): - expected_hyperparameters = { - 'Imputer': { - "categorical_impute_strategy": ["most_frequent"], - "numeric_impute_strategy": ["mean", "median", "most_frequent"] - }, - 'Random Forest Classifier': { - "n_estimators": Integer(10, 1000), - "max_depth": Integer(1, 10) - } - } - component_graph = ['Imputer', 'Random Forest Classifier'] - assert BinaryClassificationPipeline(component_graph=component_graph, parameters={}).hyperparameters == expected_hyperparameters - - -def test_nonlinear_hyperparameters(nonlinear_regression_pipeline_class): - hyperparameters = { - 'Imputer': { - "categorical_impute_strategy": ["most_frequent"], - "numeric_impute_strategy": ["mean", "median", "most_frequent"] - }, - 'OneHot': { - }, - 'Random Forest': { - "n_estimators": Integer(10, 1000), - "max_depth": Integer(1, 32) - }, - 'Elastic Net': { - 'alpha': Real(0, 1), - 'l1_ratio': Real(0, 1) - }, - 'Linear Regressor': { - 'fit_intercept': [True, False], - 'normalize': [True, False] - } - } - assert nonlinear_regression_pipeline_class(parameters={}).hyperparameters == hyperparameters - - -def test_hyperparameters_override(): - class MockPipelineOverRide(BinaryClassificationPipeline): - component_graph = ['Imputer', 'Random Forest Classifier'] - - custom_hyperparameters = { - 'Imputer': { - "categorical_impute_strategy": ["most_frequent"], - "numeric_impute_strategy": ["median", "most_frequent"] - }, - 'Random Forest Classifier': { - "n_estimators": [1, 100, 200], - "max_depth": [5] - } - } - - def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, None, parameters, custom_hyperparameters=self.custom_hyperparameters) - - hyperparameters = { - 'Imputer': { - "categorical_impute_strategy": ["most_frequent"], - "numeric_impute_strategy": ["median", "most_frequent"] - }, - 'Random Forest Classifier': { - "n_estimators": [1, 100, 200], - "max_depth": [5] - } - } - - assert MockPipelineOverRide.custom_hyperparameters == hyperparameters - assert MockPipelineOverRide(parameters={}).hyperparameters == hyperparameters - - -def test_nonlinear_hyperparameters_override(): - class NonLinearRegressionPipelineOverRide(RegressionPipeline): - component_graph = { - 'Imputer': ['Imputer'], - 'OneHot': ['One Hot Encoder', 'Imputer.x'], - 'Random Forest': ['Random Forest Regressor', 'OneHot.x'], - 'Elastic Net': ['Elastic Net Regressor', 'OneHot.x'], - 'Linear Regressor': ['Linear Regressor', 'Random Forest', 'Elastic Net'] - } - custom_hyperparameters = { - 'Imputer': { - "categorical_impute_strategy": ["most_frequent"], - "numeric_impute_strategy": ["median", "most_frequent"] - }, - 'Random Forest': { - "n_estimators": [1, 100, 200], - "max_depth": [5] - } - } - - def __init__(self, parameters, random_seed=0): - 
super().__init__(self.component_graph, None, parameters, custom_hyperparameters=self.custom_hyperparameters) - - hyperparameters = { - 'Imputer': { - "categorical_impute_strategy": ["most_frequent"], - "numeric_impute_strategy": ["median", "most_frequent"] - }, - 'OneHot': { - }, - 'Random Forest': { - "n_estimators": [1, 100, 200], - "max_depth": [5] - }, - 'Elastic Net': { - 'alpha': Real(0, 1), - 'l1_ratio': Real(0, 1) - }, - 'Linear Regressor': { - 'fit_intercept': [True, False], - 'normalize': [True, False] - } - } - assert NonLinearRegressionPipelineOverRide(parameters={}).hyperparameters == hyperparameters - - -def test_hyperparameters_none(dummy_classifier_estimator_class): - class MockEstimator(Estimator): - name = "Mock Classifier" - model_family = ModelFamily.NONE - supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS] - hyperparameter_ranges = {} - - def __init__(self, random_seed=0): - super().__init__(parameters={}, component_obj=None, random_seed=random_seed) - - pipeline = BinaryClassificationPipeline(component_graph=[MockEstimator]) - assert pipeline.component_graph == [MockEstimator] - assert pipeline.hyperparameters == {'Mock Classifier': {}} - - -def test_hyperparameters_linear_pipeline_duplicate_components(): - pipeline = BinaryClassificationPipeline(component_graph=["One Hot Encoder", "One Hot Encoder", "Random Forest Classifier"]) - assert pipeline.hyperparameters == {'One Hot Encoder': {}, - "One Hot Encoder_1": {}, - 'Random Forest Classifier': {'n_estimators': Integer(10, 1000), - 'max_depth': Integer(1, 10)}} - - pipeline = BinaryClassificationPipeline(component_graph=["One Hot Encoder", "One Hot Encoder", "Random Forest Classifier"], - custom_hyperparameters={"One Hot Encoder_1": {"top_n": Integer(10, 50)}}) - - assert pipeline.hyperparameters == {'One Hot Encoder': {}, - "One Hot Encoder_1": {"top_n": Integer(10, 50)}, - 'Random Forest Classifier': {'n_estimators': Integer(10, 1000), - 'max_depth': Integer(1, 10)}} - - @patch('evalml.pipelines.components.Estimator.predict') def test_score_with_objective_that_requires_predict_proba(mock_predict, dummy_regression_pipeline_class, X_y_binary): X, y = X_y_binary @@ -1310,14 +1139,14 @@ class MockPipeline(pipeline_class): component_graph = ['Imputer', final_estimator] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) class MockPipelineWithADifferentClassName(pipeline_class): custom_name = "Mock Pipeline" component_graph = ['Imputer', final_estimator] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) assert MockPipeline(parameters={}) != MockPipelineWithADifferentClassName(parameters={}) @@ -1334,7 +1163,7 @@ class MockPipeline(pipeline_class): component_graph = ['Imputer', final_estimator] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, 
random_seed=random_seed) class MockPipelineSubclass(MockPipeline): pass @@ -1368,7 +1197,7 @@ class MockPipeline(pipeline_class): component_graph = ['Imputer', final_estimator] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) # Test self-equality mock_pipeline = MockPipeline(parameters={}) @@ -1499,21 +1328,21 @@ class MockBinaryPipeline(BinaryClassificationPipeline): component_graph = ['Imputer', 'Random Forest Classifier'] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) class MockMulticlassPipeline(MulticlassClassificationPipeline): custom_name = "Mock Multiclass Pipeline" component_graph = ['Imputer', 'Random Forest Classifier'] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) class MockRegressionPipeline(RegressionPipeline): custom_name = "Mock Regression Pipeline" component_graph = ['Imputer', 'Random Forest Regressor'] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) binary_pipeline = MockBinaryPipeline(parameters={}) multiclass_pipeline = MockMulticlassPipeline(parameters={}) @@ -1533,25 +1362,16 @@ def test_pipeline_repr(pipeline_class): custom_name = "Mock Pipeline" component_graph = ['Imputer', final_estimator] - custom_hyperparameters = { - "Imputer": { - "numeric_impute_strategy": ['mean', 'median'] - }, - final_estimator: { - "n_estimators": Integer(50, 100) - } - } - pipeline = pipeline_class(component_graph=component_graph, custom_name=custom_name, custom_hyperparameters=custom_hyperparameters) + + pipeline = pipeline_class(component_graph=component_graph, custom_name=custom_name) expected_repr = f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', '{final_estimator}'], " \ f"parameters={{'Imputer':{{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}}, '{final_estimator}':{{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}}}, " \ - f"custom_hyperparameters={{'Imputer':{{'numeric_impute_strategy': ['mean', 'median']}}, '{final_estimator}':{{'n_estimators': Integer(low=50, high=100, prior='uniform', transform='identity')}}}}, " \ "custom_name='Mock Pipeline', random_seed=0)" assert repr(pipeline) == expected_repr - pipeline_with_parameters = pipeline_class(component_graph=component_graph, parameters={'Imputer': {'numeric_fill_value': 42}}, custom_name=custom_name, custom_hyperparameters=custom_hyperparameters) + pipeline_with_parameters = pipeline_class(component_graph=component_graph, parameters={'Imputer': {'numeric_fill_value': 
42}}, custom_name=custom_name) expected_repr = f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', '{final_estimator}'], " \ f"parameters={{'Imputer':{{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': 42}}, '{final_estimator}':{{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}}}, " \ - f"custom_hyperparameters={{'Imputer':{{'numeric_impute_strategy': ['mean', 'median']}}, '{final_estimator}':{{'n_estimators': Integer(low=50, high=100, prior='uniform', transform='identity')}}}}, " \ "custom_name='Mock Pipeline', random_seed=0)" assert repr(pipeline_with_parameters) == expected_repr diff --git a/evalml/tuners/skopt_tuner.py b/evalml/tuners/skopt_tuner.py index c3ae7d537a..3b5bd46e1d 100644 --- a/evalml/tuners/skopt_tuner.py +++ b/evalml/tuners/skopt_tuner.py @@ -34,12 +34,10 @@ def add(self, pipeline_parameters, score): Returns: None """ - print(f'SKOptTuner - add - pipeline parameters: {pipeline_parameters}') # skip adding nan scores if pd.isnull(score): return flat_parameter_values = self._convert_to_flat_parameters(pipeline_parameters) - print(f'SKOptTuner - add - flat_parameter_values: {flat_parameter_values}') try: self.opt.tell(flat_parameter_values, score) except Exception as e: diff --git a/evalml/tuners/tuner.py b/evalml/tuners/tuner.py index 26a47a72c3..21b61e932f 100644 --- a/evalml/tuners/tuner.py +++ b/evalml/tuners/tuner.py @@ -36,21 +36,14 @@ def __init__(self, pipeline_hyperparameter_ranges, random_seed=0): self._parameter_names_map[flat_parameter_name] = (component_name, parameter_name) self._search_space_names.append(flat_parameter_name) self._search_space_ranges.append(parameter_range) - print(f"Tuner - __init__ - self._parameter_names_map: {self._parameter_names_map}") - print(f"Tuner - __init__ - self._search_space_names: {self._search_space_names}") - print(f"Tuner - __init__ - self.self._search_space_ranges: {self._search_space_ranges}") def _convert_to_flat_parameters(self, pipeline_parameters): """Convert from pipeline parameters to a flat list of values""" flat_parameter_values = [] - print(f"tuner - _convert_to_flat_parameters - self._search_space_names: {self._search_space_names}") for flat_parameter_name in self._search_space_names: - print(f"tuner - _convert_to_flat_parameters - flat_parameter_name: {flat_parameter_name}") - print(f"tuner - _convert_to_flat_parameters - self._parameter_names_map[flat_parameter_name]: {self._parameter_names_map[flat_parameter_name]}") component_name, parameter_name = self._parameter_names_map[flat_parameter_name] if component_name not in pipeline_parameters or parameter_name not in pipeline_parameters[component_name]: raise TypeError('Pipeline parameters missing required field "{}" for component "{}"'.format(parameter_name, component_name)) - print(f"tuner - _convert_to_flat_parameters - adding value to flat parameters: {component_name} - {parameter_name} - {pipeline_parameters[component_name][parameter_name]}") flat_parameter_values.append(pipeline_parameters[component_name][parameter_name]) return flat_parameter_values From 23aa37bb9eb35e3342ab3e96437cba3390ed946d Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 31 May 2021 17:25:49 -0400 Subject: [PATCH 35/85] test update --- .../prediction_explanations_tests/test_algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_algorithms.py 
b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_algorithms.py index a1ae9647db..c83055c350 100644 --- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_algorithms.py +++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_algorithms.py @@ -42,7 +42,7 @@ class Pipeline(base_class): custom_name = estimator.name def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, custom_hyperparameters=None, random_seed=random_seed) + super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) return Pipeline From 0deb613594bda9fcc7e4df07fe4dedd762f78471 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 31 May 2021 18:38:18 -0400 Subject: [PATCH 36/85] test updates --- evalml/automl/engine/sequential_engine.py | 1 - evalml/tests/automl_tests/test_automl.py | 9 ++++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/evalml/automl/engine/sequential_engine.py b/evalml/automl/engine/sequential_engine.py index 349ea8506e..0912843ac5 100644 --- a/evalml/automl/engine/sequential_engine.py +++ b/evalml/automl/engine/sequential_engine.py @@ -43,7 +43,6 @@ class SequentialEngine(EngineBase): def submit_evaluation_job(self, automl_config, pipeline, X, y): logger = self.setup_job_log() - print(f"sequentialengine - submit_evaluation_job - pipeline: {pipeline}") return SequentialComputation(work=evaluate_pipeline, pipeline=pipeline, automl_config=automl_config, X=X, diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 5618287b83..52504092a7 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -1786,6 +1786,7 @@ def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline(mock_ automl_parameters, pipelines, pipeline_parameters, X_y_multi): X, y = X_y_multi + X = pd.DataFrame(X, columns=[f'Column_{i}' for i in range(20)]) pipeline_parameters_ = None pipeline_ = None @@ -1794,17 +1795,19 @@ def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline(mock_ if pipeline_parameters: pipeline_parameters_ = { + "Drop Columns Transformer": {'columns': ['Column_0', 'Column_1', 'Column_2']}, "Imputer": {'numeric_impute_strategy': 'most_frequent'}, "Random Forest Classifier": {'n_estimators': 200, "max_depth": 11} } if pipelines: - component_graph_ = ['Imputer', 'Random Forest Classifier'] + component_graph_ = ['Drop Columns Transformer', 'Imputer', 'Random Forest Classifier'] pipeline_ = [MulticlassClassificationPipeline(component_graph=component_graph_, parameters=pipeline_parameters_)] if automl_parameters: automl_parameters_ = { + "Drop Columns Transformer": {'columns': ['Column_0', 'Column_1', 'Column_2']}, "Random Forest Classifier": {'n_estimators': 201} } if custom_hyperparameters: @@ -1824,6 +1827,10 @@ def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline(mock_ for i, row in automl.full_rankings.iterrows(): if "Random Forest Classifier" in row['pipeline_name']: + if pipelines and automl_parameters: + assert row["parameters"]["Drop Columns Transformer"]["columns"] == ['Column_0', 'Column_1', 'Column_2'] + elif pipeline_parameters: + assert row["parameters"]["Drop Columns Transformer"]["columns"] is None if custom_hyperparameters_: assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in 
custom_hyperparameters_['Imputer']['numeric_impute_strategy'] assert 4 <= row["parameters"]["Random Forest Classifier"]["max_depth"] <= 7 From 2fd843eff59e1fc51df719aae75e93dc0b0a82bb Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 31 May 2021 22:54:00 -0400 Subject: [PATCH 37/85] test updates --- .../automl_algorithm/automl_algorithm.py | 2 +- evalml/automl/automl_search.py | 2 +- evalml/tests/automl_tests/test_automl.py | 27 ++++++++++--------- .../test_binary_classification.py | 1 - .../test_multiclass_classification.py | 2 -- evalml/tests/pipeline_tests/test_pipelines.py | 2 -- 6 files changed, 17 insertions(+), 19 deletions(-) diff --git a/evalml/automl/automl_algorithm/automl_algorithm.py b/evalml/automl/automl_algorithm/automl_algorithm.py index b2e3394fa3..36d5b6deae 100644 --- a/evalml/automl/automl_algorithm/automl_algorithm.py +++ b/evalml/automl/automl_algorithm/automl_algorithm.py @@ -1,8 +1,8 @@ from abc import ABC, abstractmethod +from evalml.automl.utils import get_hyperparameter_ranges from evalml.exceptions import PipelineNotFoundError from evalml.tuners import SKOptTuner -from evalml.automl.utils import get_hyperparameter_ranges class AutoMLAlgorithmException(Exception): diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index b7a311b309..1665c07ca6 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -232,7 +232,7 @@ def __init__(self, } custom_hyperparameters (dict): A dict of the hyperparameter ranges used to iterate over during search. - Keys should consist of the component names and values should specify lists or skopt.Space with length greater than 1 + Keys should consist of the component names and values should specify an skopt.Space. e.g. custom_hyperparameters = { 'Imputer' : { 'numeric_impute_strategy': Categorical(['most_frequent', 'median']) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 52504092a7..87f913a367 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -3,7 +3,6 @@ from collections import OrderedDict from itertools import product from unittest.mock import MagicMock, PropertyMock, patch -from pprint import pp import cloudpickle import numpy as np @@ -1834,12 +1833,17 @@ def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline(mock_ if custom_hyperparameters_: assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in custom_hyperparameters_['Imputer']['numeric_impute_strategy'] assert 4 <= row["parameters"]["Random Forest Classifier"]["max_depth"] <= 7 - assert 190 <= row["parameters"]["Random Forest Classifier"]["n_estimators"] <= 210 + if automl_parameters and row["id"] == 1: + assert row["parameters"]["Random Forest Classifier"]["n_estimators"] == 201 + else: + assert 190 <= row["parameters"]["Random Forest Classifier"]["n_estimators"] <= 210 else: assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in ["mean", "median", "most_frequent"] assert 1 <= row["parameters"]["Random Forest Classifier"]["max_depth"] <= 10 - assert 10 <= row["parameters"]["Random Forest Classifier"]["n_estimators"] <= 1000 - + if automl_parameters and row["id"] == 1: + assert row["parameters"]["Random Forest Classifier"]["n_estimators"] == 201 + else: + assert 10 <= row["parameters"]["Random Forest Classifier"]["n_estimators"] <= 1000 @patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.6}) @@ -2230,8 +2234,8 @@ def 
test_automl_pipeline_params_multiple(mock_score, mock_fit, X_y_regression): mock_score.return_value = {'R2': 1.0} X, y = X_y_regression hyperparams = {'Imputer': {'numeric_impute_strategy': Categorical(['median', 'most_frequent'])}, - 'Decision Tree Regressor': {'max_depth': Categorical([17, 18, 19]), 'max_features': Categorical(['auto'])}, - 'Elastic Net Regressor': {"alpha": Real(0, 0.5), "l1_ratio": Categorical((0.01, 0.02, 0.03))}} + 'Decision Tree Regressor': {'max_depth': Categorical([17, 18, 19]), 'max_features': Categorical(['auto'])}, + 'Elastic Net Regressor': {"alpha": Real(0, 0.5), "l1_ratio": Categorical((0.01, 0.02, 0.03))}} automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', custom_hyperparameters=hyperparams, n_jobs=1) automl.search() for i, row in automl.rankings.iterrows(): @@ -2292,9 +2296,8 @@ def test_automl_respects_pipeline_custom_hyperparameters_with_duplicate_componen if graph_type == 'linear': custom_hyperparameters = {"Imputer": {"numeric_impute_strategy": Categorical(["mean"])}, - "Imputer_1": { - "numeric_impute_strategy": Categorical(["most_frequent", 'mean'])}, - "Random Forest Classifier": {"n_estimators": Categorical([100, 125])}} + "Imputer_1": {"numeric_impute_strategy": Categorical(["most_frequent", 'mean'])}, + "Random Forest Classifier": {"n_estimators": Categorical([100, 125])}} component_graph = ["Imputer", "Imputer", "Random Forest Classifier"] pipeline_ = BinaryClassificationPipeline(component_graph) else: @@ -2302,8 +2305,8 @@ def test_automl_respects_pipeline_custom_hyperparameters_with_duplicate_componen "Imputer_1": {"numeric_impute_strategy": Categorical(["median", 'mean'])}, "Random Forest Classifier": {"n_estimators": Categorical([50, 100])}} component_graph = {"Imputer": ["Imputer"], - "Imputer_1": ["Imputer", "Imputer"], - "Random Forest Classifier": ["Random Forest Classifier", "Imputer_1"]} + "Imputer_1": ["Imputer", "Imputer"], + "Random Forest Classifier": ["Random Forest Classifier", "Imputer_1"]} pipeline_ = BinaryClassificationPipeline(component_graph, custom_name="Pipeline from dict") automl = AutoMLSearch(X, y, problem_type="binary", allowed_pipelines=[pipeline_], custom_hyperparameters=custom_hyperparameters, max_batches=5) @@ -2357,7 +2360,7 @@ def test_automl_pipeline_params_kwargs(mock_fit, mock_score, X_y_multi): mock_score.return_value = {'Log Loss Multiclass': 1.0} X, y = X_y_multi hyperparams = {'Imputer': {'numeric_impute_strategy': Categorical(['most_frequent'])}, - 'Decision Tree Classifier': {'max_depth': Integer(1, 2), 'ccp_alpha': Real(0.1, 0.5)}} + 'Decision Tree Classifier': {'max_depth': Integer(1, 2), 'ccp_alpha': Real(0.1, 0.5)}} automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', custom_hyperparameters=hyperparams, allowed_model_families=[ModelFamily.DECISION_TREE], n_jobs=1) automl.search() diff --git a/evalml/tests/pipeline_tests/classification_pipeline_tests/test_binary_classification.py b/evalml/tests/pipeline_tests/classification_pipeline_tests/test_binary_classification.py index f4af3c0a57..ffb9c9763a 100644 --- a/evalml/tests/pipeline_tests/classification_pipeline_tests/test_binary_classification.py +++ b/evalml/tests/pipeline_tests/classification_pipeline_tests/test_binary_classification.py @@ -3,7 +3,6 @@ import pandas as pd import pytest import woodwork as ww -from skopt.space import Categorical from evalml.exceptions import PipelineScoreError from evalml.objectives import FraudCost, get_objective diff --git 
a/evalml/tests/pipeline_tests/classification_pipeline_tests/test_multiclass_classification.py b/evalml/tests/pipeline_tests/classification_pipeline_tests/test_multiclass_classification.py index 0580abf78a..ef988bd87a 100644 --- a/evalml/tests/pipeline_tests/classification_pipeline_tests/test_multiclass_classification.py +++ b/evalml/tests/pipeline_tests/classification_pipeline_tests/test_multiclass_classification.py @@ -1,6 +1,4 @@ -from skopt.space import Categorical - from evalml.pipelines import MulticlassClassificationPipeline diff --git a/evalml/tests/pipeline_tests/test_pipelines.py b/evalml/tests/pipeline_tests/test_pipelines.py index e15e49d69e..0485ffa4bb 100644 --- a/evalml/tests/pipeline_tests/test_pipelines.py +++ b/evalml/tests/pipeline_tests/test_pipelines.py @@ -7,7 +7,6 @@ import pytest import woodwork as ww from pandas.testing import assert_frame_equal -from skopt.space import Integer, Real from evalml.demos import load_breast_cancer, load_wine from evalml.exceptions import ( @@ -32,7 +31,6 @@ ) from evalml.pipelines.components import ( ElasticNetClassifier, - Estimator, Imputer, LogisticRegressionClassifier, OneHotEncoder, From b0b4c306b41a82a158ab448903e40dd9ab40c9f1 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 1 Jun 2021 08:41:04 -0400 Subject: [PATCH 38/85] Release notes and dask engine test update --- docs/source/release_notes.rst | 1 + evalml/tests/automl_tests/dask_test_utils.py | 8 ++++---- .../latest_dependency_versions.txt | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index a5b6c4a0b5..2c57224df6 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -6,6 +6,7 @@ Release Notes * Fixes * Changes * Deleted the ``return_pandas`` flag from our demo data loaders :pr:`2181` + * Separated `custom_hyperparameters` from pipelines and added them as an argument to `AutoMLSearch :pr:`2317` * Documentation Changes .. 
warning:: diff --git a/evalml/tests/automl_tests/dask_test_utils.py b/evalml/tests/automl_tests/dask_test_utils.py index d5a57925c3..7a45f4a913 100644 --- a/evalml/tests/automl_tests/dask_test_utils.py +++ b/evalml/tests/automl_tests/dask_test_utils.py @@ -122,16 +122,16 @@ def fit(self, X, y): class TestSchemaCheckPipeline(BinaryClassificationPipeline): - def __init__(self, component_graph, parameters=None, custom_name=None, custom_hyperparameters=None, random_seed=0, + def __init__(self, component_graph, parameters=None, custom_name=None, random_seed=0, X_schema_to_check=None, y_schema_to_check=None): self.X_schema_to_check = X_schema_to_check self.y_schema_to_check = y_schema_to_check - super().__init__(component_graph, parameters, custom_name, custom_hyperparameters, random_seed) + super().__init__(component_graph, parameters, custom_name, random_seed) def clone(self): return self.__class__(self.component_graph, parameters=self.parameters, custom_name=self.custom_name, - custom_hyperparameters=self.custom_hyperparameters, random_seed=self.random_seed, - X_schema_to_check=self.X_schema_to_check, y_schema_to_check=self.y_schema_to_check) + random_seed=self.random_seed, X_schema_to_check=self.X_schema_to_check, + y_schema_to_check=self.y_schema_to_check) def fit(self, X, y): assert X.ww.schema == self.X_schema_to_check diff --git a/evalml/tests/dependency_update_check/latest_dependency_versions.txt b/evalml/tests/dependency_update_check/latest_dependency_versions.txt index e7ba7ea254..eb4dddd512 100644 --- a/evalml/tests/dependency_update_check/latest_dependency_versions.txt +++ b/evalml/tests/dependency_update_check/latest_dependency_versions.txt @@ -2,7 +2,7 @@ catboost==0.25.1 click==8.0.1 cloudpickle==1.6.0 colorama==0.4.4 -dask==2021.5.0 +dask==2021.5.1 featuretools==0.24.1 graphviz==0.16 imbalanced-learn==0.8.0 From 7f2f81d28f6b4b5b8b8a5d60a5e7cdf23ef512b1 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 1 Jun 2021 09:35:31 -0400 Subject: [PATCH 39/85] docs update --- docs/source/user_guide/pipelines.ipynb | 53 +++++--------------------- evalml/automl/automl_search.py | 2 +- 2 files changed, 10 insertions(+), 45 deletions(-) diff --git a/docs/source/user_guide/pipelines.ipynb b/docs/source/user_guide/pipelines.ipynb index 40e6a69bcb..5c5069d586 100644 --- a/docs/source/user_guide/pipelines.ipynb +++ b/docs/source/user_guide/pipelines.ipynb @@ -147,40 +147,6 @@ "print(\"Pipeline with custom name:\", pipeline_with_name.name)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Override Component Hyperparameter Ranges\n", - "\n", - "To specify custom hyperparameter ranges, set the `custom_hyperparameters` parameter to be a dictionary where each key-value pair consists of a parameter name and range. AutoML will use this dictionary to override the hyperparameter ranges collected from each component in the component graph.\n", - "\n", - "If the hyperparameter ranges are categorical values, they can be passed in as `skopt.space.Categorical` values." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from skopt.space import Categorical\n", - "\n", - "component_graph = ['Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier']\n", - "custom_hyperparameters = {\n", - " 'Imputer' : {\n", - " 'numeric_impute_strategy': Categorical(['most_frequent'])\n", - " }\n", - "}\n", - "\n", - "print(\"Without custom hyperparameters:\")\n", - "print(MulticlassClassificationPipeline(component_graph=component_graph).hyperparameters)\n", - "print()\n", - "print(\"With custom hyperparameters:\")\n", - "print(MulticlassClassificationPipeline(component_graph=component_graph,\n", - " custom_hyperparameters=custom_hyperparameters).hyperparameters)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -476,14 +442,6 @@ " 'Text Featurization Component',\n", " 'One Hot Encoder', 'Random Forest Classifier'],\n", " custom_name=\"Pipeline with Custom Component\",\n", - " custom_hyperparameters={\n", - " \"Imputer\": {\n", - " \"numeric_impute_strategy\": ['mean', 'median']\n", - " },\n", - " \"Random Forest Classifier\": {\n", - " \"n_estimators\": Integer(50, 100)\n", - " }\n", - " },\n", " random_seed=20)\n", "\n", "code = generate_pipeline_code(pipeline_instance)\n", @@ -496,6 +454,13 @@ "exec(code)\n", "pipeline.fit(X, y)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -514,9 +479,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.6" + "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 39e434eab3..c17edaedab 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -232,7 +232,7 @@ def __init__(self, } custom_hyperparameters (dict): A dict of the hyperparameter ranges used to iterate over during search. - Keys should consist of the component names and values should specify an skopt.Space. + Keys should consist of the component names and values should specify a singular value or skopt.Space. e.g. custom_hyperparameters = { 'Imputer' : { 'numeric_impute_strategy': Categorical(['most_frequent', 'median']) From 01dcd818db298894c1a3e7fb35916d74b1d3b1f6 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 1 Jun 2021 10:14:16 -0400 Subject: [PATCH 40/85] docs update --- docs/source/user_guide/automl.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/user_guide/automl.ipynb b/docs/source/user_guide/automl.ipynb index 462450bded..c715651e9b 100644 --- a/docs/source/user_guide/automl.ipynb +++ b/docs/source/user_guide/automl.ipynb @@ -472,7 +472,7 @@ "metadata": {}, "source": [ "## Limiting the AutoML Search Space\n", - "The AutoML search algorithm first trains each component in the pipeline with their default values. After the first iteration, it then tweaks the parameters of these components using the pre-defined hyperparameter ranges that these components have. To limit the search over certain hyperparameter ranges, you can specify a `pipeline_parameters` argument with your pipeline parameters. These parameters will also limit the hyperparameter search space. Hyperparameter ranges can be found through the [API reference](https://evalml.alteryx.com/en/stable/api_reference.html) for each component. 
Parameter arguments must be specified as dictionaries, but the associated values can be single values or `skopt.space` Real, Integer, Categorical values. " + "The AutoML search algorithm first trains each component in the pipeline with their default values. After the first iteration, it then tweaks the parameters of these components using the pre-defined hyperparameter ranges that these components have. To limit the search over certain hyperparameter ranges, you can specify a `custom_hyperparameters` argument with your `AutoMLSearch` parameters. These parameters will limit the hyperparameter search space. Hyperparameter ranges can be found through the [API reference](https://evalml.alteryx.com/en/stable/api_reference.html) for each component. Parameter arguments must be specified as dictionaries, but the associated values can be single values or `skopt.space` Real, Integer, Categorical values. " ] }, { @@ -490,19 +490,19 @@ "X, y = load_fraud(n_rows=1000)\n", "\n", "# example of setting parameter to just one value\n", - "pipeline_hyperparameters = {'Imputer': {\n", + "custom_hyperparameters = {'Imputer': {\n", " 'numeric_impute_strategy': 'mean'\n", "}}\n", "\n", "\n", "# limit the numeric impute strategy to include only `median` and `most_frequent`\n", "# `mean` is the default value for this argument, but it doesn't need to be included in the specified hyperparameter range for this to work\n", - "pipeline_hyperparameters = {'Imputer': {\n", + "custom_hyperparameters = {'Imputer': {\n", " 'numeric_impute_strategy': Categorical(['median', 'most_frequent'])\n", "}}\n", "\n", - "# using this pipeline parameter means that our Imputer components in the pipelines will only search through 'median' and 'most_frequent' stretegies for 'numeric_impute_strategy'\n", - "automl_constrained = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', pipeline_parameters=pipeline_hyperparameters)" + "# using this custom hyperparameter means that our Imputer components in these pipelines will only search through 'median' and 'most_frequent' strategies for 'numeric_impute_strategy'\n", + "automl_constrained = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', custom_hyperparameters=custom_hyperparameters)" ] }, { From 52522adbbcca1a83aa3035f95c7a69cf020a2f30 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 1 Jun 2021 10:29:12 -0400 Subject: [PATCH 41/85] update version in docs --- docs/source/user_guide/automl.ipynb | 2 +- docs/source/user_guide/components.ipynb | 2 +- docs/source/user_guide/data_checks.ipynb | 2 +- docs/source/user_guide/faq.ipynb | 2 +- docs/source/user_guide/model_understanding.ipynb | 2 +- docs/source/user_guide/objectives.ipynb | 2 +- docs/source/user_guide/pipelines.ipynb | 2 +- docs/source/user_guide/utilities.ipynb | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/user_guide/automl.ipynb b/docs/source/user_guide/automl.ipynb index c715651e9b..153c5dfea9 100644 --- a/docs/source/user_guide/automl.ipynb +++ b/docs/source/user_guide/automl.ipynb @@ -647,7 +647,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 3.8.6 }, "file_extension": ".py", "mimetype": "text/x-python", diff --git a/docs/source/user_guide/components.ipynb b/docs/source/user_guide/components.ipynb index ebf591e5fa..8f0db48daa 100644 --- a/docs/source/user_guide/components.ipynb +++ b/docs/source/user_guide/components.ipynb @@ -480,7 +480,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 3.8.6 
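
A minimal usage sketch of the pattern the automl.ipynb markdown cell above describes, assuming the fraud demo data and Imputer strategies shown in that cell; the Random Forest Classifier range is an illustrative assumption added here and is not part of the patch.

from skopt.space import Categorical, Integer

from evalml.automl import AutoMLSearch
from evalml.demos import load_fraud

X, y = load_fraud(n_rows=1000)

# Search ranges: the tuner will only explore these values during search.
custom_hyperparameters = {
    "Imputer": {"numeric_impute_strategy": Categorical(["median", "most_frequent"])},
    # assumed example of an estimator range; a single value or any skopt.space value is accepted
    "Random Forest Classifier": {"n_estimators": Integer(100, 300)},
}

automl_constrained = AutoMLSearch(
    X_train=X,
    y_train=y,
    problem_type="binary",
    custom_hyperparameters=custom_hyperparameters,
)
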
}, "file_extension": ".py", "mimetype": "text/x-python", diff --git a/docs/source/user_guide/data_checks.ipynb b/docs/source/user_guide/data_checks.ipynb index 501c3f7a39..7a6bbf9664 100644 --- a/docs/source/user_guide/data_checks.ipynb +++ b/docs/source/user_guide/data_checks.ipynb @@ -584,7 +584,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 3.8.6 }, "file_extension": ".py", "mimetype": "text/x-python", diff --git a/docs/source/user_guide/faq.ipynb b/docs/source/user_guide/faq.ipynb index 70319763a1..bfed4423af 100644 --- a/docs/source/user_guide/faq.ipynb +++ b/docs/source/user_guide/faq.ipynb @@ -57,7 +57,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 3.8.6 }, "file_extension": ".py", "mimetype": "text/x-python", diff --git a/docs/source/user_guide/model_understanding.ipynb b/docs/source/user_guide/model_understanding.ipynb index 1eb52bea4a..48898cdbd0 100644 --- a/docs/source/user_guide/model_understanding.ipynb +++ b/docs/source/user_guide/model_understanding.ipynb @@ -576,7 +576,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 3.8.6 }, "file_extension": ".py", "mimetype": "text/x-python", diff --git a/docs/source/user_guide/objectives.ipynb b/docs/source/user_guide/objectives.ipynb index 5e6bb78e89..10136e5b32 100644 --- a/docs/source/user_guide/objectives.ipynb +++ b/docs/source/user_guide/objectives.ipynb @@ -231,7 +231,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 3.8.6 }, "file_extension": ".py", "mimetype": "text/x-python", diff --git a/docs/source/user_guide/pipelines.ipynb b/docs/source/user_guide/pipelines.ipynb index 5c5069d586..1dc7de24da 100644 --- a/docs/source/user_guide/pipelines.ipynb +++ b/docs/source/user_guide/pipelines.ipynb @@ -472,7 +472,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 3.8.6 }, "file_extension": ".py", "mimetype": "text/x-python", diff --git a/docs/source/user_guide/utilities.ipynb b/docs/source/user_guide/utilities.ipynb index 050717c6f7..447abb516b 100644 --- a/docs/source/user_guide/utilities.ipynb +++ b/docs/source/user_guide/utilities.ipynb @@ -46,7 +46,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 3.8.6 }, "file_extension": ".py", "mimetype": "text/x-python", From cd32f60a306eae6f6f9857cf9a3dde264fcbeaee Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 1 Jun 2021 10:50:57 -0400 Subject: [PATCH 42/85] docs updates --- docs/source/user_guide/automl.ipynb | 2 +- docs/source/user_guide/components.ipynb | 2 +- docs/source/user_guide/data_checks.ipynb | 2 +- docs/source/user_guide/faq.ipynb | 2 +- docs/source/user_guide/model_understanding.ipynb | 2 +- docs/source/user_guide/objectives.ipynb | 2 +- docs/source/user_guide/pipelines.ipynb | 11 ++--------- docs/source/user_guide/utilities.ipynb | 2 +- 8 files changed, 9 insertions(+), 16 deletions(-) diff --git a/docs/source/user_guide/automl.ipynb b/docs/source/user_guide/automl.ipynb index 153c5dfea9..c715651e9b 100644 --- a/docs/source/user_guide/automl.ipynb +++ b/docs/source/user_guide/automl.ipynb @@ -647,7 +647,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3.8.6 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", diff --git a/docs/source/user_guide/components.ipynb b/docs/source/user_guide/components.ipynb index 8f0db48daa..ebf591e5fa 100644 --- 
a/docs/source/user_guide/components.ipynb +++ b/docs/source/user_guide/components.ipynb @@ -480,7 +480,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3.8.6 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", diff --git a/docs/source/user_guide/data_checks.ipynb b/docs/source/user_guide/data_checks.ipynb index 7a6bbf9664..501c3f7a39 100644 --- a/docs/source/user_guide/data_checks.ipynb +++ b/docs/source/user_guide/data_checks.ipynb @@ -584,7 +584,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3.8.6 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", diff --git a/docs/source/user_guide/faq.ipynb b/docs/source/user_guide/faq.ipynb index bfed4423af..70319763a1 100644 --- a/docs/source/user_guide/faq.ipynb +++ b/docs/source/user_guide/faq.ipynb @@ -57,7 +57,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3.8.6 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", diff --git a/docs/source/user_guide/model_understanding.ipynb b/docs/source/user_guide/model_understanding.ipynb index 48898cdbd0..1eb52bea4a 100644 --- a/docs/source/user_guide/model_understanding.ipynb +++ b/docs/source/user_guide/model_understanding.ipynb @@ -576,7 +576,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3.8.6 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", diff --git a/docs/source/user_guide/objectives.ipynb b/docs/source/user_guide/objectives.ipynb index 10136e5b32..5e6bb78e89 100644 --- a/docs/source/user_guide/objectives.ipynb +++ b/docs/source/user_guide/objectives.ipynb @@ -231,7 +231,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3.8.6 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", diff --git a/docs/source/user_guide/pipelines.ipynb b/docs/source/user_guide/pipelines.ipynb index 1dc7de24da..5203678714 100644 --- a/docs/source/user_guide/pipelines.ipynb +++ b/docs/source/user_guide/pipelines.ipynb @@ -454,13 +454,6 @@ "exec(code)\n", "pipeline.fit(X, y)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -472,14 +465,14 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3.8.6 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.8.6" } }, "nbformat": 4, diff --git a/docs/source/user_guide/utilities.ipynb b/docs/source/user_guide/utilities.ipynb index 447abb516b..050717c6f7 100644 --- a/docs/source/user_guide/utilities.ipynb +++ b/docs/source/user_guide/utilities.ipynb @@ -46,7 +46,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3.8.6 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", From 3f127790226bfd403e48605e3a1502cd31899c7a Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 1 Jun 2021 20:14:51 -0400 Subject: [PATCH 43/85] remove superfluous code --- evalml/automl/automl_algorithm/automl_algorithm.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/evalml/automl/automl_algorithm/automl_algorithm.py b/evalml/automl/automl_algorithm/automl_algorithm.py index 36d5b6deae..a3eb8ab790 100644 --- a/evalml/automl/automl_algorithm/automl_algorithm.py +++ b/evalml/automl/automl_algorithm/automl_algorithm.py @@ -37,10 +37,6 @@ def __init__(self, self._tuners = {} for 
pipeline in self.allowed_pipelines: pipeline_hyperparameters = get_hyperparameter_ranges(pipeline.component_graph, custom_hyperparameters) - if custom_hyperparameters: - for comp_name in custom_hyperparameters.keys(): - if comp_name in pipeline.parameters.keys(): - pipeline_hyperparameters[comp_name].update(custom_hyperparameters[comp_name]) self._tuners[pipeline.name] = self._tuner_class(pipeline_hyperparameters, random_seed=self.random_seed) self._pipeline_number = 0 self._batch_number = 0 From 6a0045cbacbafc6eccbdef671fce9683336e734c Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Wed, 2 Jun 2021 10:52:06 -0400 Subject: [PATCH 44/85] test update --- evalml/tests/automl_tests/test_iterative_algorithm.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py index 6d9bad0bb3..28886bc46a 100644 --- a/evalml/tests/automl_tests/test_iterative_algorithm.py +++ b/evalml/tests/automl_tests/test_iterative_algorithm.py @@ -326,9 +326,15 @@ def test_iterative_algorithm_custom_hyperparameters(parameters, hyperparameters, algo.add_result(score, pipeline, {"id": algo.pipeline_number}) # make sure that future batches remain in the hyperparam range + all_dummies = set() for i in range(1, 5): next_batch = algo.next_batch() + for p in next_batch: + dummy = p.parameters['Mock Classifier']['dummy_parameter'] + if dummy not in all_dummies: + all_dummies.add(dummy) assert all([p.parameters['Mock Classifier']['dummy_parameter'] in hyperparameters for p in next_batch]) + assert all_dummies == {1, 3, 4} if parameters == 1 else all_dummies == {2, 3, 4} def test_iterative_algorithm_frozen_parameters(): From c7471c07dc6a79729e91fcdc15558353a641cd3e Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 3 Jun 2021 11:38:46 -0400 Subject: [PATCH 45/85] prints --- evalml/automl/automl_algorithm/iterative_algorithm.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index acf0bc64a6..6edd7f2954 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -124,6 +124,7 @@ def next_batch(self): pipeline = self._first_batch_results[idx][1] for i in range(self.pipelines_per_batch): proposed_parameters = self._tuners[pipeline.name].propose() + print(f"iterativealgorothm - next_batch - proposed_parameters: {proposed_parameters}") parameters = self._combine_parameters(pipeline, proposed_parameters) next_batch.append(pipeline.new(parameters=parameters, random_seed=self.random_seed)) self._pipeline_number += len(next_batch) @@ -173,9 +174,11 @@ def _transform_parameters(self, pipeline, proposed_parameters): for name, component_class in pipeline.linearized_component_graph: component_parameters = proposed_parameters.get(name, {}) init_params = inspect.signature(component_class.__init__).parameters + print(f"iterativealgorothm - _transform_parameters - init_params: {init_params}") # For first batch, pass the pipeline params to the components that need them if name in self._custom_hyperparameters and self._batch_number == 0: for param_name, value in self._custom_hyperparameters[name].items(): + print(f"iterativealgorothm - _transform_parameters - hyperparameter name/param_name/value: {name}/{param_name}/{value}") if isinstance(value, (Integer, Real)): # get a random value in the space component_parameters[param_name] = 
value.rvs(random_state=self.random_seed)[0] @@ -185,6 +188,7 @@ def _transform_parameters(self, pipeline, proposed_parameters): component_parameters[param_name] = value if name in self._pipeline_params and self._batch_number == 0: for param_name, value in self._pipeline_params[name].items(): + print(f"iterativealgorothm - _transform_parameters - pipeline name/param_name/value: {name}/{param_name}/{value}") if isinstance(value, (Integer, Real, Categorical)): raise ValueError("Pipeline parameters should not contain skopt.Space variables, please pass them " "to custom_hyperparameters instead!") From 67441c0418bc8959d8581262a6e7b0437d356cc3 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 4 Jun 2021 11:21:45 -0400 Subject: [PATCH 46/85] clean nitpick --- evalml/automl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index 03bb631750..e448ae3b93 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -172,6 +172,6 @@ def get_hyperparameter_ranges(component_graph, custom_hyperparameters): for component_name, component_class in linearized_component_graph: component_hyperparameters = copy.copy(component_class.hyperparameter_ranges) if custom_hyperparameters and component_name in custom_hyperparameters: - component_hyperparameters.update(custom_hyperparameters.get(component_name, {})) + component_hyperparameters.update(custom_hyperparameters[component_name]) hyperparameter_ranges[component_name] = component_hyperparameters return hyperparameter_ranges From 23b4dd3d8ecddecd96f1e09cc75f93554561a5c3 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 4 Jun 2021 12:19:10 -0400 Subject: [PATCH 47/85] Reformattings --- Makefile | 7 +- dev-requirements.txt | 2 +- evalml/__init__.py | 3 +- evalml/automl/__init__.py | 6 +- .../automl_algorithm/automl_algorithm.py | 27 +- .../automl_algorithm/iterative_algorithm.py | 215 +- evalml/automl/automl_search.py | 771 +++- evalml/automl/callbacks.py | 22 +- evalml/automl/engine/__init__.py | 8 +- evalml/automl/engine/dask_engine.py | 56 +- evalml/automl/engine/engine_base.py | 155 +- evalml/automl/engine/sequential_engine.py | 45 +- evalml/automl/pipeline_search_plots.py | 57 +- evalml/automl/utils.py | 113 +- .../data_checks/class_imbalance_data_check.py | 68 +- evalml/data_checks/data_check_action.py | 8 +- evalml/data_checks/data_check_message.py | 14 +- evalml/data_checks/data_check_message_code.py | 4 +- evalml/data_checks/data_check_message_type.py | 1 + evalml/data_checks/data_checks.py | 41 +- evalml/data_checks/datetime_nan_data_check.py | 25 +- evalml/data_checks/default_data_checks.py | 43 +- evalml/data_checks/highly_null_data_check.py | 72 +- evalml/data_checks/id_columns_data_check.py | 76 +- .../data_checks/invalid_targets_data_check.py | 240 +- .../multicollinearity_data_check.py | 36 +- .../natural_language_nan_data_check.py | 29 +- evalml/data_checks/no_variance_data_check.py | 47 +- evalml/data_checks/outliers_data_check.py | 66 +- evalml/data_checks/sparsity_data_check.py | 42 +- .../data_checks/target_leakage_data_check.py | 63 +- evalml/data_checks/uniqueness_data_check.py | 84 +- evalml/demos/churn.py | 12 +- evalml/demos/fraud.py | 8 +- evalml/exceptions/__init__.py | 3 +- evalml/exceptions/exceptions.py | 12 +- evalml/model_family/model_family.py | 64 +- evalml/model_family/utils.py | 4 +- evalml/model_understanding/__init__.py | 9 +- evalml/model_understanding/graphs.py | 824 +++- .../permutation_importance.py | 239 +- 
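
A minimal sketch, with assumed component names and ranges, of the behaviour the hunks in the two patches above implement: custom_hyperparameters are overlaid on each component's default hyperparameter ranges before the tuner is built, and for the first batch an Integer/Real range is collapsed to a concrete value with rvs(). The helper names below are hypothetical and only mirror the logic visible in the diff.

import copy

from skopt.space import Categorical, Integer, Real


def merge_ranges(default_ranges, custom_hyperparameters):
    # Overlay custom ranges on top of per-component defaults, component by component.
    merged = {}
    for component_name, component_ranges in default_ranges.items():
        component_ranges = copy.copy(component_ranges)
        if custom_hyperparameters and component_name in custom_hyperparameters:
            component_ranges.update(custom_hyperparameters[component_name])
        merged[component_name] = component_ranges
    return merged


def first_batch_value(value, random_seed=0):
    # Integer/Real spaces are sampled as in the diff; plain values pass through unchanged.
    if isinstance(value, (Integer, Real)):
        return value.rvs(random_state=random_seed)[0]
    return value


defaults = {"Imputer": {"numeric_impute_strategy": Categorical(["mean", "median", "most_frequent"])}}
custom = {"Imputer": {"numeric_impute_strategy": Categorical(["median", "most_frequent"])}}

print(merge_ranges(defaults, custom))        # the custom range replaces the Imputer default
print(first_batch_value(Integer(10, 1000)))  # a single concrete value drawn from the range
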
.../prediction_explanations/_algorithms.py | 43 +- .../_report_creator_factory.py | 46 +- .../_user_interface.py | 521 ++- .../prediction_explanations/explainers.py | 170 +- evalml/objectives/__init__.py | 15 +- .../binary_classification_objective.py | 8 +- evalml/objectives/cost_benefit_matrix.py | 20 +- evalml/objectives/fraud_cost.py | 17 +- evalml/objectives/lead_scoring.py | 13 +- evalml/objectives/objective_base.py | 30 +- evalml/objectives/sensitivity_low_alert.py | 4 +- evalml/objectives/standard_metrics.py | 86 +- evalml/objectives/utils.py | 48 +- evalml/pipelines/__init__.py | 4 +- .../binary_classification_pipeline.py | 16 +- .../binary_classification_pipeline_mixin.py | 15 +- evalml/pipelines/classification_pipeline.py | 48 +- evalml/pipelines/component_graph.py | 167 +- evalml/pipelines/components/__init__.py | 9 +- evalml/pipelines/components/component_base.py | 35 +- .../components/component_base_meta.py | 12 +- .../ensemble/stacked_ensemble_base.py | 65 +- .../ensemble/stacked_ensemble_classifier.py | 30 +- .../ensemble/stacked_ensemble_regressor.py | 27 +- .../components/estimators/__init__.py | 50 +- .../classifiers/baseline_classifier.py | 33 +- .../classifiers/catboost_classifier.py | 59 +- .../classifiers/decision_tree_classifier.py | 50 +- .../classifiers/elasticnet_classifier.py | 51 +- .../estimators/classifiers/et_classifier.py | 54 +- .../classifiers/kneighbors_classifier.py | 47 +- .../classifiers/lightgbm_classifier.py | 89 +- .../classifiers/logistic_regression.py | 43 +- .../estimators/classifiers/rf_classifier.py | 30 +- .../estimators/classifiers/svm_classifier.py | 48 +- .../classifiers/xgboost_classifier.py | 53 +- .../components/estimators/estimator.py | 25 +- .../estimators/regressors/arima_regressor.py | 106 +- .../regressors/baseline_regressor.py | 16 +- .../regressors/catboost_regressor.py | 54 +- .../regressors/decision_tree_regressor.py | 47 +- .../regressors/elasticnet_regressor.py | 35 +- .../estimators/regressors/et_regressor.py | 51 +- .../regressors/lightgbm_regressor.py | 71 +- .../estimators/regressors/linear_regressor.py | 29 +- .../estimators/regressors/rf_regressor.py | 27 +- .../estimators/regressors/svm_regressor.py | 27 +- .../time_series_baseline_estimator.py | 28 +- .../regressors/xgboost_regressor.py | 50 +- .../components/transformers/__init__.py | 16 +- .../transformers/column_selectors.py | 17 +- .../dimensionality_reduction/lda.py | 29 +- .../dimensionality_reduction/pca.py | 29 +- .../transformers/encoders/onehot_encoder.py | 131 +- .../transformers/encoders/target_encoder.py | 68 +- .../feature_selection/feature_selector.py | 22 +- .../rf_classifier_feature_selector.py | 66 +- .../rf_regressor_feature_selector.py | 66 +- .../transformers/imputers/imputer.py | 75 +- .../imputers/per_column_imputer.py | 41 +- .../transformers/imputers/simple_imputer.py | 38 +- .../transformers/imputers/target_imputer.py | 37 +- .../preprocessing/datetime_featurizer.py | 89 +- .../delayed_feature_transformer.py | 39 +- .../preprocessing/drop_null_columns.py | 11 +- .../preprocessing/featuretools.py | 27 +- .../transformers/preprocessing/lsa.py | 22 +- .../preprocessing/polynomial_detrender.py | 15 +- .../preprocessing/text_featurizer.py | 48 +- .../preprocessing/text_transformer.py | 8 +- .../transformers/samplers/base_sampler.py | 59 +- .../transformers/samplers/oversamplers.py | 71 +- .../transformers/samplers/undersampler.py | 41 +- .../transformers/scalers/standard_scaler.py | 17 +- .../components/transformers/transformer.py | 14 +- 
evalml/pipelines/components/utils.py | 78 +- .../multiclass_classification_pipeline.py | 1 + evalml/pipelines/pipeline_base.py | 277 +- evalml/pipelines/pipeline_meta.py | 26 +- evalml/pipelines/regression_pipeline.py | 6 +- .../time_series_classification_pipelines.py | 92 +- .../time_series_regression_pipeline.py | 47 +- evalml/pipelines/utils.py | 143 +- evalml/preprocessing/__init__.py | 2 +- .../balanced_classification_sampler.py | 33 +- .../data_splitters/time_series_split.py | 20 +- .../training_validation_split.py | 28 +- evalml/preprocessing/utils.py | 44 +- evalml/problem_types/__init__.py | 10 +- evalml/problem_types/problem_types.py | 27 +- evalml/problem_types/utils.py | 46 +- evalml/tests/automl_tests/dask_test_utils.py | 100 +- evalml/tests/automl_tests/test_automl.py | 3982 ++++++++++++----- .../automl_tests/test_automl_algorithm.py | 17 +- evalml/tests/automl_tests/test_automl_dask.py | 111 +- .../test_automl_search_classification.py | 1268 ++++-- .../test_automl_search_regression.py | 344 +- .../tests/automl_tests/test_automl_utils.py | 268 +- evalml/tests/automl_tests/test_dask_engine.py | 220 +- evalml/tests/automl_tests/test_engine_base.py | 128 +- .../automl_tests/test_iterative_algorithm.py | 584 ++- .../test_pipeline_search_plots.py | 46 +- evalml/tests/automl_tests/test_search.py | 38 +- .../automl_tests/test_time_series_split.py | 56 +- .../component_tests/test_arima_regressor.py | 86 +- .../test_baseline_classifier.py | 55 +- .../test_catboost_classifier.py | 10 +- .../test_catboost_regressor.py | 10 +- .../test_column_selector_transformers.py | 101 +- .../tests/component_tests/test_components.py | 904 +++- .../test_datetime_featurizer.py | 289 +- .../test_decision_tree_classifier.py | 15 +- .../test_decision_tree_regressor.py | 10 +- .../test_delayed_features_transformer.py | 542 ++- .../test_drop_null_columns_transformer.py | 166 +- .../component_tests/test_en_classifier.py | 81 +- .../component_tests/test_en_regressor.py | 30 +- .../tests/component_tests/test_estimators.py | 177 +- .../component_tests/test_et_classifier.py | 9 +- .../component_tests/test_et_regressor.py | 6 +- .../component_tests/test_feature_selectors.py | 58 +- .../component_tests/test_featuretools.py | 66 +- evalml/tests/component_tests/test_imputer.py | 404 +- .../component_tests/test_knn_classifier.py | 9 +- evalml/tests/component_tests/test_lda.py | 164 +- .../component_tests/test_lgbm_classifier.py | 108 +- .../component_tests/test_lgbm_regressor.py | 70 +- evalml/tests/component_tests/test_lsa.py | 173 +- .../component_tests/test_one_hot_encoder.py | 598 ++- .../component_tests/test_oversamplers.py | 184 +- evalml/tests/component_tests/test_pca.py | 143 +- .../test_per_column_imputer.py | 224 +- .../test_polynomial_detrender.py | 26 +- .../component_tests/test_simple_imputer.py | 366 +- .../test_stacked_ensemble_classifier.py | 136 +- .../test_stacked_ensemble_regressor.py | 119 +- .../component_tests/test_standard_scaler.py | 11 +- .../component_tests/test_svm_classifier.py | 13 +- .../component_tests/test_svm_regressor.py | 9 +- .../component_tests/test_target_encoder.py | 217 +- .../component_tests/test_target_imputer.py | 123 +- .../component_tests/test_text_featurizer.py | 358 +- .../test_time_series_baseline_estimators.py | 2 +- .../component_tests/test_undersampler.py | 34 +- evalml/tests/component_tests/test_utils.py | 56 +- .../test_xgboost_classifier.py | 18 +- .../component_tests/test_xgboost_regressor.py | 18 +- evalml/tests/conftest.py | 484 +- 
.../test_class_imbalance_data_check.py | 521 ++- .../data_checks_tests/test_data_check.py | 20 +- .../test_data_check_action.py | 39 +- .../test_data_check_message.py | 109 +- .../data_checks_tests/test_data_checks.py | 724 ++- .../test_datetime_nan_data_check.py | 115 +- .../test_highly_null_data_check.py | 229 +- .../test_id_columns_data_check.py | 335 +- .../test_invalid_targets_data_check.py | 730 ++- .../test_multicollinearity_data_check.py | 103 +- .../test_natural_language_nan_data_check.py | 155 +- .../test_no_variance_data_check.py | 251 +- .../test_outliers_data_check.py | 100 +- .../test_sparsity_data_check.py | 154 +- .../test_target_leakage_data_check.py | 601 ++- .../test_uniqueness_data_check.py | 102 +- .../model_family_tests/test_model_family.py | 45 +- .../test_algorithms.py | 240 +- .../test_explainers.py | 1527 +++++-- .../test_user_interface.py | 608 ++- .../model_understanding_tests/test_graphs.py | 1077 +++-- .../test_partial_dependence.py | 885 ++-- .../test_permutation_importance.py | 554 ++- .../test_binary_classification_objective.py | 66 +- .../test_cost_benefit_matrix.py | 103 +- .../objective_tests/test_fraud_detection.py | 64 +- .../objective_tests/test_lead_scoring.py | 43 +- .../tests/objective_tests/test_objectives.py | 62 +- evalml/tests/objective_tests/test_sla.py | 35 +- .../objective_tests/test_standard_metrics.py | 507 ++- .../test_binary_classification.py | 185 +- .../test_classification.py | 51 +- .../test_multiclass_classification.py | 81 +- .../test_regression.py | 103 +- .../pipeline_tests/test_component_graph.py | 1073 +++-- evalml/tests/pipeline_tests/test_graphs.py | 134 +- .../pipeline_tests/test_pipeline_utils.py | 643 ++- evalml/tests/pipeline_tests/test_pipelines.py | 1790 +++++--- .../test_time_series_pipeline.py | 519 ++- .../test_balanced_classification_sampler.py | 165 +- .../preprocessing_tests/test_drop_na_rows.py | 9 +- .../preprocessing_tests/test_split_data.py | 19 +- .../test_training_validation_split.py | 26 +- .../problem_type_tests/test_problem_types.py | 83 +- evalml/tests/test_all_test_dirs_included.py | 10 +- .../tuner_tests/test_grid_search_tuner.py | 48 +- .../tuner_tests/test_random_search_tuner.py | 75 +- evalml/tests/tuner_tests/test_skopt_tuner.py | 300 +- evalml/tests/utils_tests/test_cli_utils.py | 36 +- evalml/tests/utils_tests/test_dependencies.py | 52 +- evalml/tests/utils_tests/test_gen_utils.py | 624 ++- evalml/tests/utils_tests/test_logger.py | 137 +- .../tests/utils_tests/test_woodwork_utils.py | 103 +- evalml/tuners/grid_search_tuner.py | 12 +- evalml/tuners/random_search_tuner.py | 14 +- evalml/tuners/skopt_tuner.py | 23 +- evalml/tuners/tuner.py | 62 +- evalml/tuners/tuner_exceptions.py | 2 + evalml/utils/__init__.py | 4 +- evalml/utils/base_meta.py | 19 +- evalml/utils/cli_utils.py | 38 +- evalml/utils/gen_utils.py | 162 +- evalml/utils/logger.py | 21 +- evalml/utils/woodwork_utils.py | 49 +- 253 files changed, 26746 insertions(+), 12093 deletions(-) diff --git a/Makefile b/Makefile index cbf73e8d51..0f77beac25 100644 --- a/Makefile +++ b/Makefile @@ -8,12 +8,13 @@ clean: .PHONY: lint lint: - flake8 evalml && isort --check-only --recursive evalml && python docs/notebook_version_standardizer.py check-versions + flake8 evalml && isort --check-only evalml && python docs/notebook_version_standardizer.py check-versions + black evalml --check .PHONY: lint-fix lint-fix: - autopep8 --in-place --recursive --max-line-length=100 
--select="E225,E222,E303,E261,E241,E302,E203,E128,E231,E251,E271,E127,E126,E301,W291,W293,E226,E306,E221" evalml - isort --recursive evalml + black evalml + isort evalml python docs/notebook_version_standardizer.py standardize .PHONY: test diff --git a/dev-requirements.txt b/dev-requirements.txt index cef7c37ce6..c100d5bc01 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,5 +2,5 @@ -r test-requirements.txt -r docs-requirements.txt flake8==3.7.0 -autopep8==1.4.3 +black==21.5b1 isort==4.3.4 diff --git a/evalml/__init__.py b/evalml/__init__.py index 75cdb0c946..4b4ae84179 100644 --- a/evalml/__init__.py +++ b/evalml/__init__.py @@ -13,6 +13,7 @@ import evalml.data_checks from evalml.automl import AutoMLSearch, search from evalml.utils import print_info + with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) warnings.simplefilter("ignore", DeprecationWarning) @@ -20,4 +21,4 @@ warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=DeprecationWarning) -__version__ = '0.25.0' +__version__ = "0.25.0" diff --git a/evalml/automl/__init__.py b/evalml/automl/__init__.py index cfeebcefed..ccfd30dcb5 100644 --- a/evalml/automl/__init__.py +++ b/evalml/automl/__init__.py @@ -1,3 +1,7 @@ from .automl_search import AutoMLSearch, search -from .utils import get_default_primary_search_objective, make_data_splitter, tune_binary_threshold +from .utils import ( + get_default_primary_search_objective, + make_data_splitter, + tune_binary_threshold, +) from .engine import SequentialEngine, EngineBase diff --git a/evalml/automl/automl_algorithm/automl_algorithm.py b/evalml/automl/automl_algorithm/automl_algorithm.py index a3eb8ab790..5be9b569f8 100644 --- a/evalml/automl/automl_algorithm/automl_algorithm.py +++ b/evalml/automl/automl_algorithm/automl_algorithm.py @@ -7,18 +7,21 @@ class AutoMLAlgorithmException(Exception): """Exception raised when an error is encountered during the computation of the automl algorithm""" + pass class AutoMLAlgorithm(ABC): """Base class for the automl algorithms which power evalml.""" - def __init__(self, - allowed_pipelines=None, - custom_hyperparameters=None, - max_iterations=None, - tuner_class=None, - random_seed=0): + def __init__( + self, + allowed_pipelines=None, + custom_hyperparameters=None, + max_iterations=None, + tuner_class=None, + random_seed=0, + ): """This class represents an automated machine learning (AutoML) algorithm. It encapsulates the decision-making logic behind an automl search, by both deciding which pipelines to evaluate next and by deciding what set of parameters to configure the pipeline with. To use this interface, you must define a next_batch method which returns the next group of pipelines to evaluate on the training data. That method may access state and results recorded from the previous batches, although that information is not tracked in a general way in this base class. Overriding add_result is a convenient way to record pipeline evaluation info if necessary. 
@@ -36,8 +39,12 @@ def __init__(self, self._tuner_class = tuner_class or SKOptTuner self._tuners = {} for pipeline in self.allowed_pipelines: - pipeline_hyperparameters = get_hyperparameter_ranges(pipeline.component_graph, custom_hyperparameters) - self._tuners[pipeline.name] = self._tuner_class(pipeline_hyperparameters, random_seed=self.random_seed) + pipeline_hyperparameters = get_hyperparameter_ranges( + pipeline.component_graph, custom_hyperparameters + ) + self._tuners[pipeline.name] = self._tuner_class( + pipeline_hyperparameters, random_seed=self.random_seed + ) self._pipeline_number = 0 self._batch_number = 0 @@ -58,7 +65,9 @@ def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): trained_pipeline_results (dict): Results from training a pipeline. """ if pipeline.name not in self._tuners: - raise PipelineNotFoundError(f"No such pipeline allowed in this AutoML search: {pipeline.name}") + raise PipelineNotFoundError( + f"No such pipeline allowed in this AutoML search: {pipeline.name}" + ) self._tuners[pipeline.name].add(pipeline.parameters, score_to_minimize) @property diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 6edd7f2954..53bbd6073d 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -17,27 +17,29 @@ ModelFamily.XGBOOST, ModelFamily.LIGHTGBM, ModelFamily.CATBOOST, - ModelFamily.ARIMA + ModelFamily.ARIMA, ] class IterativeAlgorithm(AutoMLAlgorithm): """An automl algorithm which first fits a base round of pipelines with default parameters, then does a round of parameter tuning on each pipeline in order of performance.""" - def __init__(self, - allowed_pipelines=None, - max_iterations=None, - tuner_class=None, - random_seed=0, - pipelines_per_batch=5, - n_jobs=-1, # TODO remove - number_features=None, # TODO remove - ensembling=False, - text_in_ensembling=False, - pipeline_params=None, - custom_hyperparameters=None, - _frozen_pipeline_parameters=None, - _estimator_family_order=None): + def __init__( + self, + allowed_pipelines=None, + max_iterations=None, + tuner_class=None, + random_seed=0, + pipelines_per_batch=5, + n_jobs=-1, # TODO remove + number_features=None, # TODO remove + ensembling=False, + text_in_ensembling=False, + pipeline_params=None, + custom_hyperparameters=None, + _frozen_pipeline_parameters=None, + _estimator_family_order=None, + ): """An automl algorithm which first fits a base round of pipelines with default parameters, then does a round of parameter tuning on each pipeline in order of performance. Arguments: @@ -55,25 +57,35 @@ def __init__(self, _frozen_pipeline_parameters (dict or None): Pipeline-level parameters are frozen and used in the proposed pipelines. _estimator_family_order (list(ModelFamily) or None): specify the sort order for the first batch. Defaults to _ESTIMATOR_FAMILY_ORDER. 
""" - self._estimator_family_order = _estimator_family_order or _ESTIMATOR_FAMILY_ORDER + self._estimator_family_order = ( + _estimator_family_order or _ESTIMATOR_FAMILY_ORDER + ) indices = [] pipelines_to_sort = [] pipelines_end = [] for pipeline in allowed_pipelines or []: if pipeline.model_family in self._estimator_family_order: - indices.append(self._estimator_family_order.index(pipeline.model_family)) + indices.append( + self._estimator_family_order.index(pipeline.model_family) + ) pipelines_to_sort.append(pipeline) else: pipelines_end.append(pipeline) - pipelines_start = [pipeline for _, pipeline in (sorted(zip(indices, pipelines_to_sort), - key=lambda pair: pair[0]) or [])] + pipelines_start = [ + pipeline + for _, pipeline in ( + sorted(zip(indices, pipelines_to_sort), key=lambda pair: pair[0]) or [] + ) + ] allowed_pipelines = pipelines_start + pipelines_end - super().__init__(allowed_pipelines=allowed_pipelines, - custom_hyperparameters=custom_hyperparameters, - max_iterations=max_iterations, - tuner_class=tuner_class, - random_seed=random_seed) + super().__init__( + allowed_pipelines=allowed_pipelines, + custom_hyperparameters=custom_hyperparameters, + max_iterations=max_iterations, + tuner_class=tuner_class, + random_seed=random_seed, + ) self.pipelines_per_batch = pipelines_per_batch self.n_jobs = n_jobs self.number_features = number_features @@ -93,47 +105,73 @@ def next_batch(self): """ if self._batch_number == 1: if len(self._first_batch_results) == 0: - raise AutoMLAlgorithmException('No results were reported from the first batch') - self._first_batch_results = sorted(self._first_batch_results, key=itemgetter(0)) + raise AutoMLAlgorithmException( + "No results were reported from the first batch" + ) + self._first_batch_results = sorted( + self._first_batch_results, key=itemgetter(0) + ) next_batch = [] if self._batch_number == 0: - next_batch = [pipeline.new(parameters=self._combine_parameters(pipeline, {}), random_seed=self.random_seed) - for pipeline in self.allowed_pipelines] + next_batch = [ + pipeline.new( + parameters=self._combine_parameters(pipeline, {}), + random_seed=self.random_seed, + ) + for pipeline in self.allowed_pipelines + ] # One after training all pipelines one round - elif (self.ensembling and - self._batch_number != 1 and - (self._batch_number) % (len(self._first_batch_results) + 1) == 0): + elif ( + self.ensembling + and self._batch_number != 1 + and (self._batch_number) % (len(self._first_batch_results) + 1) == 0 + ): input_pipelines = [] for pipeline_dict in self._best_pipeline_info.values(): - pipeline = pipeline_dict['pipeline'] - pipeline_params = pipeline_dict['parameters'] + pipeline = pipeline_dict["pipeline"] + pipeline_params = pipeline_dict["parameters"] parameters = self._combine_parameters(pipeline, pipeline_params) - input_pipelines.append(pipeline.new(parameters=parameters, - random_seed=self.random_seed)) + input_pipelines.append( + pipeline.new(parameters=parameters, random_seed=self.random_seed) + ) n_jobs_ensemble = 1 if self.text_in_ensembling else self.n_jobs - ensemble = _make_stacked_ensemble_pipeline(input_pipelines, input_pipelines[0].problem_type, - random_seed=self.random_seed, - n_jobs=n_jobs_ensemble) + ensemble = _make_stacked_ensemble_pipeline( + input_pipelines, + input_pipelines[0].problem_type, + random_seed=self.random_seed, + n_jobs=n_jobs_ensemble, + ) next_batch.append(ensemble) else: - num_pipelines = (len(self._first_batch_results) + 1) if self.ensembling else len(self._first_batch_results) + num_pipelines = ( 
+ (len(self._first_batch_results) + 1) + if self.ensembling + else len(self._first_batch_results) + ) idx = (self._batch_number - 1) % num_pipelines pipeline = self._first_batch_results[idx][1] for i in range(self.pipelines_per_batch): proposed_parameters = self._tuners[pipeline.name].propose() - print(f"iterativealgorothm - next_batch - proposed_parameters: {proposed_parameters}") + print( + f"iterativealgorothm - next_batch - proposed_parameters: {proposed_parameters}" + ) parameters = self._combine_parameters(pipeline, proposed_parameters) - next_batch.append(pipeline.new(parameters=parameters, random_seed=self.random_seed)) + next_batch.append( + pipeline.new(parameters=parameters, random_seed=self.random_seed) + ) self._pipeline_number += len(next_batch) self._batch_number += 1 return next_batch def _combine_parameters(self, pipeline, proposed_parameters): """Helper function for logic to transform proposed parameters and frozen parameters.""" - _returning = {**self._transform_parameters(pipeline, proposed_parameters), **self._frozen_pipeline_parameters} + _returning = { + **self._transform_parameters(pipeline, proposed_parameters), + **self._frozen_pipeline_parameters, + } return _returning def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): @@ -147,62 +185,97 @@ def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): if pipeline.model_family != ModelFamily.ENSEMBLE: if self.batch_number == 1: try: - super().add_result(score_to_minimize, pipeline, trained_pipeline_results) + super().add_result( + score_to_minimize, pipeline, trained_pipeline_results + ) except ValueError as e: - if 'is not within the bounds of the space' in str(e): - raise ValueError("Default parameters for components in pipeline {} not in the hyperparameter ranges: {}".format(pipeline.name, e)) + if "is not within the bounds of the space" in str(e): + raise ValueError( + "Default parameters for components in pipeline {} not in the hyperparameter ranges: {}".format( + pipeline.name, e + ) + ) else: - raise(e) + raise (e) else: - super().add_result(score_to_minimize, pipeline, trained_pipeline_results) + super().add_result( + score_to_minimize, pipeline, trained_pipeline_results + ) if self.batch_number == 1: self._first_batch_results.append((score_to_minimize, pipeline)) - current_best_score = self._best_pipeline_info.get(pipeline.model_family, {}).get('mean_cv_score', np.inf) - if score_to_minimize is not None and score_to_minimize < current_best_score and pipeline.model_family != ModelFamily.ENSEMBLE: - self._best_pipeline_info.update({pipeline.model_family: {'mean_cv_score': score_to_minimize, - 'pipeline': pipeline, - 'parameters': pipeline.parameters, - 'id': trained_pipeline_results['id']} - }) + current_best_score = self._best_pipeline_info.get( + pipeline.model_family, {} + ).get("mean_cv_score", np.inf) + if ( + score_to_minimize is not None + and score_to_minimize < current_best_score + and pipeline.model_family != ModelFamily.ENSEMBLE + ): + self._best_pipeline_info.update( + { + pipeline.model_family: { + "mean_cv_score": score_to_minimize, + "pipeline": pipeline, + "parameters": pipeline.parameters, + "id": trained_pipeline_results["id"], + } + } + ) def _transform_parameters(self, pipeline, proposed_parameters): """Given a pipeline parameters dict, make sure n_jobs and number_features are set.""" parameters = {} - if 'pipeline' in self._pipeline_params: - parameters['pipeline'] = self._pipeline_params['pipeline'] + if "pipeline" in self._pipeline_params: + 
parameters["pipeline"] = self._pipeline_params["pipeline"] for name, component_class in pipeline.linearized_component_graph: component_parameters = proposed_parameters.get(name, {}) init_params = inspect.signature(component_class.__init__).parameters - print(f"iterativealgorothm - _transform_parameters - init_params: {init_params}") + print( + f"iterativealgorothm - _transform_parameters - init_params: {init_params}" + ) # For first batch, pass the pipeline params to the components that need them if name in self._custom_hyperparameters and self._batch_number == 0: for param_name, value in self._custom_hyperparameters[name].items(): - print(f"iterativealgorothm - _transform_parameters - hyperparameter name/param_name/value: {name}/{param_name}/{value}") + print( + f"iterativealgorothm - _transform_parameters - hyperparameter name/param_name/value: {name}/{param_name}/{value}" + ) if isinstance(value, (Integer, Real)): # get a random value in the space - component_parameters[param_name] = value.rvs(random_state=self.random_seed)[0] + component_parameters[param_name] = value.rvs( + random_state=self.random_seed + )[0] elif isinstance(value, Categorical): - component_parameters[param_name] = value.rvs(random_state=self.random_seed) + component_parameters[param_name] = value.rvs( + random_state=self.random_seed + ) else: component_parameters[param_name] = value if name in self._pipeline_params and self._batch_number == 0: for param_name, value in self._pipeline_params[name].items(): - print(f"iterativealgorothm - _transform_parameters - pipeline name/param_name/value: {name}/{param_name}/{value}") + print( + f"iterativealgorothm - _transform_parameters - pipeline name/param_name/value: {name}/{param_name}/{value}" + ) if isinstance(value, (Integer, Real, Categorical)): - raise ValueError("Pipeline parameters should not contain skopt.Space variables, please pass them " - "to custom_hyperparameters instead!") + raise ValueError( + "Pipeline parameters should not contain skopt.Space variables, please pass them " + "to custom_hyperparameters instead!" 
+ ) else: component_parameters[param_name] = value # Inspects each component and adds the following parameters when needed - if 'n_jobs' in init_params: - component_parameters['n_jobs'] = self.n_jobs - if 'number_features' in init_params: - component_parameters['number_features'] = self.number_features - if name in self._pipeline_params and name == 'Drop Columns Transformer' and self._batch_number > 0: - component_parameters['columns'] = self._pipeline_params[name]['columns'] - if 'pipeline' in self._pipeline_params: - for param_name, value in self._pipeline_params['pipeline'].items(): + if "n_jobs" in init_params: + component_parameters["n_jobs"] = self.n_jobs + if "number_features" in init_params: + component_parameters["number_features"] = self.number_features + if ( + name in self._pipeline_params + and name == "Drop Columns Transformer" + and self._batch_number > 0 + ): + component_parameters["columns"] = self._pipeline_params[name]["columns"] + if "pipeline" in self._pipeline_params: + for param_name, value in self._pipeline_params["pipeline"].items(): if param_name in init_params: component_parameters[param_name] = value parameters[name] = component_parameters diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index c17edaedab..ca5a664ed8 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -20,19 +20,19 @@ check_all_pipeline_names_unique, get_best_sampler_for_data, get_default_primary_search_objective, - make_data_splitter + make_data_splitter, ) from evalml.data_checks import DefaultDataChecks from evalml.exceptions import ( AutoMLSearchException, PipelineNotFoundError, - PipelineScoreError + PipelineScoreError, ) from evalml.model_family import ModelFamily from evalml.objectives import ( get_core_objectives, get_non_core_objectives, - get_objective + get_objective, ) from evalml.pipelines import ( BinaryClassificationPipeline, @@ -41,7 +41,7 @@ RegressionPipeline, TimeSeriesBinaryClassificationPipeline, TimeSeriesMulticlassClassificationPipeline, - TimeSeriesRegressionPipeline + TimeSeriesRegressionPipeline, ) from evalml.pipelines.components.utils import get_estimators from evalml.pipelines.utils import make_pipeline @@ -49,21 +49,16 @@ ProblemTypes, handle_problem_types, is_classification, - is_time_series + is_time_series, ) from evalml.tuners import SKOptTuner from evalml.utils import convert_to_seconds, infer_feature_types -from evalml.utils.logger import ( - get_logger, - log_subtitle, - log_title, - time_elapsed -) +from evalml.utils.logger import get_logger, log_subtitle, log_title, time_elapsed logger = get_logger(__file__) -def search(X_train=None, y_train=None, problem_type=None, objective='auto', **kwargs): +def search(X_train=None, y_train=None, problem_type=None, objective="auto", **kwargs): """Given data and configuration, run an automl search. This method will run EvalML's default suite of data checks. If the data checks produce errors, the data check results will be returned before running the automl search. In that case we recommend you alter your data to address these errors and try again. 
@@ -92,17 +87,24 @@ def search(X_train=None, y_train=None, problem_type=None, objective='auto', **kw X_train = infer_feature_types(X_train) y_train = infer_feature_types(y_train) problem_type = handle_problem_types(problem_type) - if objective == 'auto': + if objective == "auto": objective = get_default_primary_search_objective(problem_type) objective = get_objective(objective, return_instance=False) automl_config = kwargs - automl_config.update({'X_train': X_train, 'y_train': y_train, 'problem_type': problem_type, - 'objective': objective, 'max_batches': 1}) + automl_config.update( + { + "X_train": X_train, + "y_train": y_train, + "problem_type": problem_type, + "objective": objective, + "max_batches": 1, + } + ) data_checks = DefaultDataChecks(problem_type=problem_type, objective=objective) data_check_results = data_checks.validate(X_train, y=y_train) - if len(data_check_results.get('errors', [])): + if len(data_check_results.get("errors", [])): return None, data_check_results automl = AutoMLSearch(**automl_config) @@ -112,42 +114,45 @@ def search(X_train=None, y_train=None, problem_type=None, objective='auto', **kw class AutoMLSearch: """Automated Pipeline search.""" + _MAX_NAME_LEN = 40 # Necessary for "Plotting" documentation, since Sphinx does not work well with instance attributes. plot = PipelineSearchPlots - def __init__(self, - X_train=None, - y_train=None, - problem_type=None, - objective='auto', - max_iterations=None, - max_time=None, - patience=None, - tolerance=None, - data_splitter=None, - allowed_pipelines=None, - allowed_model_families=None, - start_iteration_callback=None, - add_result_callback=None, - error_callback=None, - additional_objectives=None, - random_seed=0, - n_jobs=-1, - tuner_class=None, - optimize_thresholds=True, - ensembling=False, - max_batches=None, - problem_configuration=None, - train_best_pipeline=True, - pipeline_parameters=None, - custom_hyperparameters=None, - sampler_method="auto", - sampler_balanced_ratio=0.25, - _ensembling_split_size=0.2, - _pipelines_per_batch=5, - engine=None): + def __init__( + self, + X_train=None, + y_train=None, + problem_type=None, + objective="auto", + max_iterations=None, + max_time=None, + patience=None, + tolerance=None, + data_splitter=None, + allowed_pipelines=None, + allowed_model_families=None, + start_iteration_callback=None, + add_result_callback=None, + error_callback=None, + additional_objectives=None, + random_seed=0, + n_jobs=-1, + tuner_class=None, + optimize_thresholds=True, + ensembling=False, + max_batches=None, + problem_configuration=None, + train_best_pipeline=True, + pipeline_parameters=None, + custom_hyperparameters=None, + sampler_method="auto", + sampler_balanced_ratio=0.25, + _ensembling_split_size=0.2, + _pipelines_per_batch=5, + engine=None, + ): """Automated pipeline search Arguments: @@ -255,17 +260,25 @@ def __init__(self, be used. 
""" if X_train is None: - raise ValueError('Must specify training data as a 2d array using the X_train argument') + raise ValueError( + "Must specify training data as a 2d array using the X_train argument" + ) if y_train is None: - raise ValueError('Must specify training data target values as a 1d vector using the y_train argument') + raise ValueError( + "Must specify training data target values as a 1d vector using the y_train argument" + ) try: self.problem_type = handle_problem_types(problem_type) except ValueError: - raise ValueError('choose one of (binary, multiclass, regression) as problem_type') + raise ValueError( + "choose one of (binary, multiclass, regression) as problem_type" + ) if is_time_series(self.problem_type): - warnings.warn("Time series support in evalml is still in beta, which means we are still actively building " - "its core features. Please be mindful of that when running search().") + warnings.warn( + "Time series support in evalml is still in beta, which means we are still actively building " + "its core features. Please be mindful of that when running search()." + ) self.tuner_class = tuner_class or SKOptTuner self.start_iteration_callback = start_iteration_callback @@ -274,35 +287,62 @@ def __init__(self, self.data_splitter = data_splitter self.optimize_thresholds = optimize_thresholds self.ensembling = ensembling - if objective == 'auto': + if objective == "auto": objective = get_default_primary_search_objective(self.problem_type.value) objective = get_objective(objective, return_instance=False) self.objective = self._validate_objective(objective) - if self.data_splitter is not None and not issubclass(self.data_splitter.__class__, BaseCrossValidator): + if self.data_splitter is not None and not issubclass( + self.data_splitter.__class__, BaseCrossValidator + ): raise ValueError("Not a valid data splitter") if not objective.is_defined_for_problem_type(self.problem_type): - raise ValueError("Given objective {} is not compatible with a {} problem.".format(self.objective.name, self.problem_type.value)) + raise ValueError( + "Given objective {} is not compatible with a {} problem.".format( + self.objective.name, self.problem_type.value + ) + ) if additional_objectives is None: additional_objectives = get_core_objectives(self.problem_type) # if our main objective is part of default set of objectives for problem_type, remove it - existing_main_objective = next((obj for obj in additional_objectives if obj.name == self.objective.name), None) + existing_main_objective = next( + ( + obj + for obj in additional_objectives + if obj.name == self.objective.name + ), + None, + ) if existing_main_objective is not None: additional_objectives.remove(existing_main_objective) else: additional_objectives = [get_objective(o) for o in additional_objectives] - additional_objectives = [self._validate_objective(obj) for obj in additional_objectives] + additional_objectives = [ + self._validate_objective(obj) for obj in additional_objectives + ] self.additional_objectives = additional_objectives - self.objective_name_to_class = {o.name: o for o in [self.objective] + self.additional_objectives} + self.objective_name_to_class = { + o.name: o for o in [self.objective] + self.additional_objectives + } if not isinstance(max_time, (int, float, str, type(None))): - raise TypeError(f"Parameter max_time must be a float, int, string or None. Received {type(max_time)} with value {str(max_time)}..") + raise TypeError( + f"Parameter max_time must be a float, int, string or None. 
Received {type(max_time)} with value {str(max_time)}.." + ) if isinstance(max_time, (int, float)) and max_time < 0: - raise ValueError(f"Parameter max_time must be None or non-negative. Received {max_time}.") + raise ValueError( + f"Parameter max_time must be None or non-negative. Received {max_time}." + ) if max_batches is not None and max_batches < 0: - raise ValueError(f"Parameter max_batches must be None or non-negative. Received {max_batches}.") + raise ValueError( + f"Parameter max_batches must be None or non-negative. Received {max_batches}." + ) if max_iterations is not None and max_iterations < 0: - raise ValueError(f"Parameter max_iterations must be None or non-negative. Received {max_iterations}.") - self.max_time = convert_to_seconds(max_time) if isinstance(max_time, str) else max_time + raise ValueError( + f"Parameter max_iterations must be None or non-negative. Received {max_iterations}." + ) + self.max_time = ( + convert_to_seconds(max_time) if isinstance(max_time, str) else max_time + ) self.max_iterations = max_iterations self.max_batches = max_batches self._pipelines_per_batch = _pipelines_per_batch @@ -311,17 +351,25 @@ def __init__(self, logger.info("Using default limit of max_batches=1.\n") if patience and (not isinstance(patience, int) or patience < 0): - raise ValueError("patience value must be a positive integer. Received {} instead".format(patience)) + raise ValueError( + "patience value must be a positive integer. Received {} instead".format( + patience + ) + ) if tolerance and (tolerance > 1.0 or tolerance < 0.0): - raise ValueError("tolerance value must be a float between 0.0 and 1.0 inclusive. Received {} instead".format(tolerance)) + raise ValueError( + "tolerance value must be a float between 0.0 and 1.0 inclusive. Received {} instead".format( + tolerance + ) + ) self.patience = patience self.tolerance = tolerance or 0.0 self._results = { - 'pipeline_results': {}, - 'search_order': [], + "pipeline_results": {}, + "search_order": [], } self._pipelines_searched = dict() self.random_seed = random_seed @@ -331,12 +379,20 @@ def __init__(self, try: self.plot = PipelineSearchPlots(self) except ImportError: - logger.warning("Unable to import plotly; skipping pipeline search plotting\n") + logger.warning( + "Unable to import plotly; skipping pipeline search plotting\n" + ) if allowed_pipelines is not None and not isinstance(allowed_pipelines, list): - raise ValueError("Parameter allowed_pipelines must be either None or a list!") - if allowed_pipelines is not None and not all(isinstance(p, PipelineBase) for p in allowed_pipelines): - raise ValueError("Every element of allowed_pipelines must an instance of PipelineBase!") + raise ValueError( + "Parameter allowed_pipelines must be either None or a list!" + ) + if allowed_pipelines is not None and not all( + isinstance(p, PipelineBase) for p in allowed_pipelines + ): + raise ValueError( + "Every element of allowed_pipelines must an instance of PipelineBase!" 
+ ) self.allowed_pipelines = allowed_pipelines self.allowed_model_families = allowed_model_families self._automl_algorithm = None @@ -345,7 +401,9 @@ def __init__(self, self.show_batch_output = False self._validate_problem_type() - self.problem_configuration = self._validate_problem_configuration(problem_configuration) + self.problem_configuration = self._validate_problem_configuration( + problem_configuration + ) self._train_best_pipeline = train_best_pipeline self._best_pipeline = None self._searched = False @@ -353,11 +411,22 @@ def __init__(self, self.X_train = infer_feature_types(X_train) self.y_train = infer_feature_types(y_train) - default_data_splitter = make_data_splitter(self.X_train, self.y_train, self.problem_type, self.problem_configuration, - n_splits=3, shuffle=True, random_seed=self.random_seed) + default_data_splitter = make_data_splitter( + self.X_train, + self.y_train, + self.problem_type, + self.problem_configuration, + n_splits=3, + shuffle=True, + random_seed=self.random_seed, + ) self.data_splitter = self.data_splitter or default_data_splitter - self.pipeline_parameters = pipeline_parameters if pipeline_parameters is not None else {} - self.custom_hyperparameters = custom_hyperparameters if custom_hyperparameters is not None else {} + self.pipeline_parameters = ( + pipeline_parameters if pipeline_parameters is not None else {} + ) + self.custom_hyperparameters = ( + custom_hyperparameters if custom_hyperparameters is not None else {} + ) self.search_iteration_plot = None self._interrupted = False self._frozen_pipeline_parameters = {} @@ -366,8 +435,10 @@ def __init__(self, custom_hyperparameters = copy.copy(self.custom_hyperparameters) if self.problem_configuration: - parameters.update({'pipeline': self.problem_configuration}) - self._frozen_pipeline_parameters.update({'pipeline': self.problem_configuration}) + parameters.update({"pipeline": self.problem_configuration}) + self._frozen_pipeline_parameters.update( + {"pipeline": self.problem_configuration} + ) self.sampler_method = sampler_method self.sampler_balanced_ratio = sampler_balanced_ratio @@ -375,23 +446,54 @@ def __init__(self, if is_classification(self.problem_type): self._sampler_name = self.sampler_method - if self.sampler_method in ['auto', 'Oversampler']: - self._sampler_name = get_best_sampler_for_data(self.X_train, self.y_train, self.sampler_method, self.sampler_balanced_ratio) + if self.sampler_method in ["auto", "Oversampler"]: + self._sampler_name = get_best_sampler_for_data( + self.X_train, + self.y_train, + self.sampler_method, + self.sampler_balanced_ratio, + ) if self._sampler_name not in parameters: - parameters[self._sampler_name] = {"sampling_ratio": self.sampler_balanced_ratio} + parameters[self._sampler_name] = { + "sampling_ratio": self.sampler_balanced_ratio + } else: - parameters[self._sampler_name].update({"sampling_ratio": self.sampler_balanced_ratio}) - self._frozen_pipeline_parameters[self._sampler_name] = parameters[self._sampler_name] + parameters[self._sampler_name].update( + {"sampling_ratio": self.sampler_balanced_ratio} + ) + self._frozen_pipeline_parameters[self._sampler_name] = parameters[ + self._sampler_name + ] if self.allowed_pipelines is None: logger.info("Generating pipelines to search over...") - allowed_estimators = get_estimators(self.problem_type, self.allowed_model_families) - logger.debug(f"allowed_estimators set to {[estimator.name for estimator in allowed_estimators]}") - drop_columns = self.pipeline_parameters['Drop Columns Transformer']['columns'] if 'Drop 
Columns Transformer' in self.pipeline_parameters else None - index_columns = list(self.X_train.ww.select('index').columns) + allowed_estimators = get_estimators( + self.problem_type, self.allowed_model_families + ) + logger.debug( + f"allowed_estimators set to {[estimator.name for estimator in allowed_estimators]}" + ) + drop_columns = ( + self.pipeline_parameters["Drop Columns Transformer"]["columns"] + if "Drop Columns Transformer" in self.pipeline_parameters + else None + ) + index_columns = list(self.X_train.ww.select("index").columns) if len(index_columns) > 0 and drop_columns is None: - self._frozen_pipeline_parameters['Drop Columns Transformer'] = {'columns': index_columns} - self.allowed_pipelines = [make_pipeline(self.X_train, self.y_train, estimator, self.problem_type, parameters=self._frozen_pipeline_parameters, sampler_name=self._sampler_name) for estimator in allowed_estimators] + self._frozen_pipeline_parameters["Drop Columns Transformer"] = { + "columns": index_columns + } + self.allowed_pipelines = [ + make_pipeline( + self.X_train, + self.y_train, + estimator, + self.problem_type, + parameters=self._frozen_pipeline_parameters, + sampler_name=self._sampler_name, + ) + for estimator in allowed_estimators + ] if self.allowed_pipelines == []: raise ValueError("No allowed pipelines to search") @@ -400,19 +502,30 @@ def __init__(self, check_all_pipeline_names_unique(self.allowed_pipelines) run_ensembling = self.ensembling - text_in_ensembling = len(self.X_train.ww.select('natural_language').columns) > 0 + text_in_ensembling = len(self.X_train.ww.select("natural_language").columns) > 0 if run_ensembling and len(self.allowed_pipelines) == 1: - logger.warning("Ensembling is set to True, but the number of unique pipelines is one, so ensembling will not run.") + logger.warning( + "Ensembling is set to True, but the number of unique pipelines is one, so ensembling will not run." + ) run_ensembling = False if run_ensembling and self.max_iterations is not None: # Baseline + first batch + each pipeline iteration + 1 - first_ensembling_iteration = (1 + len(self.allowed_pipelines) + len(self.allowed_pipelines) * self._pipelines_per_batch + 1) + first_ensembling_iteration = ( + 1 + + len(self.allowed_pipelines) + + len(self.allowed_pipelines) * self._pipelines_per_batch + + 1 + ) if self.max_iterations < first_ensembling_iteration: run_ensembling = False - logger.warning(f"Ensembling is set to True, but max_iterations is too small, so ensembling will not run. Set max_iterations >= {first_ensembling_iteration} to run ensembling.") + logger.warning( + f"Ensembling is set to True, but max_iterations is too small, so ensembling will not run. Set max_iterations >= {first_ensembling_iteration} to run ensembling." + ) else: - logger.info(f"Ensembling will run at the {first_ensembling_iteration} iteration and every {len(self.allowed_pipelines) * self._pipelines_per_batch} iterations after that.") + logger.info( + f"Ensembling will run at the {first_ensembling_iteration} iteration and every {len(self.allowed_pipelines) * self._pipelines_per_batch} iterations after that." + ) if self.max_batches and self.max_iterations is None: self.show_batch_output = True @@ -421,30 +534,52 @@ def __init__(self, num_ensemble_batches = (self.max_batches - 1) // ensemble_nth_batch if num_ensemble_batches == 0: run_ensembling = False - logger.warning(f"Ensembling is set to True, but max_batches is too small, so ensembling will not run. 
Set max_batches >= {ensemble_nth_batch + 1} to run ensembling.") + logger.warning( + f"Ensembling is set to True, but max_batches is too small, so ensembling will not run. Set max_batches >= {ensemble_nth_batch + 1} to run ensembling." + ) else: - logger.info(f"Ensembling will run every {ensemble_nth_batch} batches.") - - self.max_iterations = (1 + len(self.allowed_pipelines) + - self._pipelines_per_batch * (self.max_batches - 1 - num_ensemble_batches) + - num_ensemble_batches) + logger.info( + f"Ensembling will run every {ensemble_nth_batch} batches." + ) + + self.max_iterations = ( + 1 + + len(self.allowed_pipelines) + + self._pipelines_per_batch + * (self.max_batches - 1 - num_ensemble_batches) + + num_ensemble_batches + ) else: - self.max_iterations = 1 + len(self.allowed_pipelines) + (self._pipelines_per_batch * (self.max_batches - 1)) + self.max_iterations = ( + 1 + + len(self.allowed_pipelines) + + (self._pipelines_per_batch * (self.max_batches - 1)) + ) if not engine: self._engine = SequentialEngine() else: self._engine = engine - self.automl_config = AutoMLConfig(self.data_splitter, self.problem_type, - self.objective, self.additional_objectives, self.optimize_thresholds, - self.error_callback, self.random_seed, - self.X_train.ww.schema, - self.y_train.ww.schema) + self.automl_config = AutoMLConfig( + self.data_splitter, + self.problem_type, + self.objective, + self.additional_objectives, + self.optimize_thresholds, + self.error_callback, + self.random_seed, + self.X_train.ww.schema, + self.y_train.ww.schema, + ) - self.allowed_model_families = list(set([p.model_family for p in (self.allowed_pipelines)])) + self.allowed_model_families = list( + set([p.model_family for p in (self.allowed_pipelines)]) + ) - logger.debug(f"allowed_pipelines set to {[pipeline.name for pipeline in self.allowed_pipelines]}") + logger.debug( + f"allowed_pipelines set to {[pipeline.name for pipeline in self.allowed_pipelines]}" + ) logger.debug(f"allowed_model_families set to {self.allowed_model_families}") self._automl_algorithm = IterativeAlgorithm( @@ -459,12 +594,15 @@ def __init__(self, text_in_ensembling=text_in_ensembling, pipeline_params=parameters, custom_hyperparameters=custom_hyperparameters, - _frozen_pipeline_parameters=self._frozen_pipeline_parameters + _frozen_pipeline_parameters=self._frozen_pipeline_parameters, ) def _get_batch_number(self): batch_number = 1 - if self._automl_algorithm is not None and self._automl_algorithm.batch_number > 0: + if ( + self._automl_algorithm is not None + and self._automl_algorithm.batch_number > 0 + ): batch_number = self._automl_algorithm.batch_number return batch_number @@ -476,16 +614,18 @@ def _validate_objective(self, objective): non_core_objectives = get_non_core_objectives() if isinstance(objective, type): if objective in non_core_objectives: - raise ValueError(f"{objective.name.lower()} is not allowed in AutoML! " - "Use evalml.objectives.utils.get_core_objective_names() " - "to get all objective names allowed in automl.") + raise ValueError( + f"{objective.name.lower()} is not allowed in AutoML! " + "Use evalml.objectives.utils.get_core_objective_names() " + "to get all objective names allowed in automl." 
+ ) return objective() return objective def __str__(self): def _print_list(obj_list): - lines = sorted(['\t{}'.format(o.name) for o in obj_list]) - return '\n'.join(lines) + lines = sorted(["\t{}".format(o.name) for o in obj_list]) + return "\n".join(lines) def _get_funct_name(function): if callable(function): @@ -515,17 +655,23 @@ def _get_funct_name(function): rankings_desc = "" if not self.rankings.empty: - rankings_str = self.rankings.drop(['parameters'], axis='columns').to_string() + rankings_str = self.rankings.drop( + ["parameters"], axis="columns" + ).to_string() rankings_desc = f"\nSearch Results: \n{'='*20}\n{rankings_str}" return search_desc + rankings_desc def _validate_problem_configuration(self, problem_configuration=None): if self.problem_type in [ProblemTypes.TIME_SERIES_REGRESSION]: - required_parameters = {'date_index', 'gap', 'max_delay'} - if not problem_configuration or not all(p in problem_configuration for p in required_parameters): - raise ValueError("user_parameters must be a dict containing values for at least the date_index, gap, and max_delay " - f"parameters. Received {problem_configuration}.") + required_parameters = {"date_index", "gap", "max_delay"} + if not problem_configuration or not all( + p in problem_configuration for p in required_parameters + ): + raise ValueError( + "user_parameters must be a dict containing values for at least the date_index, gap, and max_delay " + f"parameters. Received {problem_configuration}." + ) return problem_configuration or {} def _handle_keyboard_interrupt(self): @@ -537,7 +683,11 @@ def _handle_keyboard_interrupt(self): leading_char = "\n" start_of_loop = time.time() while True: - choice = input(leading_char + "Do you really want to exit search (y/n)? ").strip().lower() + choice = ( + input(leading_char + "Do you really want to exit search (y/n)? ") + .strip() + .lower() + ) if choice == "y": logger.info("Exiting AutoMLSearch.") return True @@ -560,7 +710,9 @@ def search(self, show_iteration_plot=True): Disabled by default in non-Jupyter enviroments. """ if self._searched: - logger.info("AutoMLSearch.search() has already been run and will not run again on the same instance. Re-initialize AutoMLSearch to search again.") + logger.info( + "AutoMLSearch.search() has already been run and will not run again on the same instance. Re-initialize AutoMLSearch to search again." + ) return # don't show iteration plot outside of a jupyter notebook @@ -572,19 +724,35 @@ def search(self, show_iteration_plot=True): log_title(logger, "Beginning pipeline search") logger.info("Optimizing for %s. " % self.objective.name) - logger.info("{} score is better.\n".format('Greater' if self.objective.greater_is_better else 'Lower')) - logger.info(f"Using {self._engine.__class__.__name__} to train and score pipelines.") + logger.info( + "{} score is better.\n".format( + "Greater" if self.objective.greater_is_better else "Lower" + ) + ) + logger.info( + f"Using {self._engine.__class__.__name__} to train and score pipelines." + ) if self.max_batches is not None: - logger.info(f"Searching up to {self.max_batches} batches for a total of {self.max_iterations} pipelines. ") + logger.info( + f"Searching up to {self.max_batches} batches for a total of {self.max_iterations} pipelines. " + ) elif self.max_iterations is not None: logger.info("Searching up to %s pipelines. 
" % self.max_iterations) if self.max_time is not None: - logger.info("Will stop searching for new pipelines after %d seconds.\n" % self.max_time) - logger.info("Allowed model families: %s\n" % ", ".join([model.value for model in self.allowed_model_families])) + logger.info( + "Will stop searching for new pipelines after %d seconds.\n" + % self.max_time + ) + logger.info( + "Allowed model families: %s\n" + % ", ".join([model.value for model in self.allowed_model_families]) + ) self.search_iteration_plot = None if self.plot: - self.search_iteration_plot = self.plot.search_iteration_plot(interactive_plot=show_iteration_plot) + self.search_iteration_plot = self.plot.search_iteration_plot( + interactive_plot=show_iteration_plot + ) self._start = time.time() @@ -604,25 +772,35 @@ def search(self, show_iteration_plot=True): if not loop_interrupted: current_batch_pipelines = self._automl_algorithm.next_batch() except StopIteration: - logger.info('AutoML Algorithm out of recommendations, ending') + logger.info("AutoML Algorithm out of recommendations, ending") break try: new_pipeline_ids = [] log_title(logger, f"Evaluating Batch Number {self._get_batch_number()}") for pipeline in current_batch_pipelines: self._pre_evaluation_callback(pipeline) - computation = self._engine.submit_evaluation_job(self.automl_config, pipeline, self.X_train, self.y_train) + computation = self._engine.submit_evaluation_job( + self.automl_config, pipeline, self.X_train, self.y_train + ) computations.append(computation) current_computation_index = 0 while self._should_continue() and len(computations) > 0: computation = computations[current_computation_index] if computation.done(): evaluation = computation.get_result() - data, pipeline, job_log = evaluation.get('scores'), evaluation.get("pipeline"), evaluation.get("logger") - pipeline_id = self._post_evaluation_callback(pipeline, data, job_log) + data, pipeline, job_log = ( + evaluation.get("scores"), + evaluation.get("pipeline"), + evaluation.get("logger"), + ) + pipeline_id = self._post_evaluation_callback( + pipeline, data, job_log + ) new_pipeline_ids.append(pipeline_id) computations.pop(current_computation_index) - current_computation_index = (current_computation_index + 1) % max(len(computations), 1) + current_computation_index = (current_computation_index + 1) % max( + len(computations), 1 + ) time.sleep(0.1) loop_interrupted = False except KeyboardInterrupt: @@ -633,10 +811,17 @@ def search(self, show_iteration_plot=True): computation.cancel() full_rankings = self.full_rankings - current_batch_idx = full_rankings['id'].isin(new_pipeline_ids) - current_batch_pipeline_scores = full_rankings[current_batch_idx]["mean_cv_score"] - if len(current_batch_pipeline_scores) and current_batch_pipeline_scores.isna().all(): - raise AutoMLSearchException(f"All pipelines in the current AutoML batch produced a score of np.nan on the primary objective {self.objective}.") + current_batch_idx = full_rankings["id"].isin(new_pipeline_ids) + current_batch_pipeline_scores = full_rankings[current_batch_idx][ + "mean_cv_score" + ] + if ( + len(current_batch_pipeline_scores) + and current_batch_pipeline_scores.isna().all() + ): + raise AutoMLSearchException( + f"All pipelines in the current AutoML batch produced a score of np.nan on the primary objective {self.objective}." 
+ ) self.search_duration = time.time() - self._start elapsed_time = time_elapsed(self._start) @@ -649,7 +834,9 @@ def search(self, show_iteration_plot=True): best_pipeline = self.rankings.iloc[0] best_pipeline_name = best_pipeline["pipeline_name"] logger.info(f"Best pipeline: {best_pipeline_name}") - logger.info(f"Best pipeline {self.objective.name}: {best_pipeline['mean_cv_score']:3f}") + logger.info( + f"Best pipeline {self.objective.name}: {best_pipeline['mean_cv_score']:3f}" + ) self._searched = True def _find_best_pipeline(self): @@ -658,13 +845,17 @@ def _find_best_pipeline(self): if len(self.rankings) == 0: return best_pipeline = self.rankings.iloc[0] - if not (self._best_pipeline and self._best_pipeline == self.get_pipeline(best_pipeline['id'])): - best_pipeline = self.get_pipeline(best_pipeline['id']) + if not ( + self._best_pipeline + and self._best_pipeline == self.get_pipeline(best_pipeline["id"]) + ): + best_pipeline = self.get_pipeline(best_pipeline["id"]) if self._train_best_pipeline: X_train = self.X_train y_train = self.y_train - best_pipeline = self._engine.submit_training_job(self.automl_config, best_pipeline, - X_train, y_train).get_result() + best_pipeline = self._engine.submit_training_job( + self.automl_config, best_pipeline, X_train, y_train + ).get_result() self._best_pipeline = best_pipeline @@ -674,7 +865,7 @@ def _num_pipelines(self): Returns: int: the number of pipeline evaluations made in the search """ - return len(self._results['pipeline_results']) + return len(self._results["pipeline_results"]) def _should_continue(self): """Given the original stopping criterion and current state, should the search continue? @@ -698,57 +889,104 @@ def _should_continue(self): if self.patience is None or self.tolerance is None: return True - first_id = self._results['search_order'][0] - best_score = self._results['pipeline_results'][first_id]["mean_cv_score"] + first_id = self._results["search_order"][0] + best_score = self._results["pipeline_results"][first_id]["mean_cv_score"] num_without_improvement = 0 - for id in self._results['search_order'][1:]: - curr_score = self._results['pipeline_results'][id]["mean_cv_score"] - significant_change = abs((curr_score - best_score) / best_score) > self.tolerance - score_improved = curr_score > best_score if self.objective.greater_is_better else curr_score < best_score + for id in self._results["search_order"][1:]: + curr_score = self._results["pipeline_results"][id]["mean_cv_score"] + significant_change = ( + abs((curr_score - best_score) / best_score) > self.tolerance + ) + score_improved = ( + curr_score > best_score + if self.objective.greater_is_better + else curr_score < best_score + ) if score_improved and significant_change: best_score = curr_score num_without_improvement = 0 else: num_without_improvement += 1 if num_without_improvement >= self.patience: - logger.info("\n\n{} iterations without improvement. Stopping search early...".format(self.patience)) + logger.info( + "\n\n{} iterations without improvement. 
Stopping search early...".format( + self.patience + ) + ) return False return True def _validate_problem_type(self): for obj in self.additional_objectives: if not obj.is_defined_for_problem_type(self.problem_type): - raise ValueError("Additional objective {} is not compatible with a {} problem.".format(obj.name, self.problem_type.value)) + raise ValueError( + "Additional objective {} is not compatible with a {} problem.".format( + obj.name, self.problem_type.value + ) + ) for pipeline in self.allowed_pipelines or []: if pipeline.problem_type != self.problem_type: - raise ValueError("Given pipeline {} is not compatible with problem_type {}.".format(pipeline.name, self.problem_type.value)) + raise ValueError( + "Given pipeline {} is not compatible with problem_type {}.".format( + pipeline.name, self.problem_type.value + ) + ) def _get_baseline_pipeline(self): """Creates a baseline pipeline instance.""" if self.problem_type == ProblemTypes.BINARY: - baseline = BinaryClassificationPipeline(component_graph=["Baseline Classifier"], - custom_name="Mode Baseline Binary Classification Pipeline", - parameters={"Baseline Classifier": {"strategy": "mode"}}) + baseline = BinaryClassificationPipeline( + component_graph=["Baseline Classifier"], + custom_name="Mode Baseline Binary Classification Pipeline", + parameters={"Baseline Classifier": {"strategy": "mode"}}, + ) elif self.problem_type == ProblemTypes.MULTICLASS: - baseline = MulticlassClassificationPipeline(component_graph=["Baseline Classifier"], - custom_name="Mode Baseline Multiclass Classification Pipeline", - parameters={"Baseline Classifier": {"strategy": "mode"}}) + baseline = MulticlassClassificationPipeline( + component_graph=["Baseline Classifier"], + custom_name="Mode Baseline Multiclass Classification Pipeline", + parameters={"Baseline Classifier": {"strategy": "mode"}}, + ) elif self.problem_type == ProblemTypes.REGRESSION: - baseline = RegressionPipeline(component_graph=["Baseline Regressor"], - custom_name="Mean Baseline Regression Pipeline", - parameters={"Baseline Classifier": {"strategy": "mean"}}) + baseline = RegressionPipeline( + component_graph=["Baseline Regressor"], + custom_name="Mean Baseline Regression Pipeline", + parameters={"Baseline Classifier": {"strategy": "mean"}}, + ) else: - pipeline_class, pipeline_name = {ProblemTypes.TIME_SERIES_REGRESSION: (TimeSeriesRegressionPipeline, "Time Series Baseline Regression Pipeline"), - ProblemTypes.TIME_SERIES_MULTICLASS: (TimeSeriesMulticlassClassificationPipeline, "Time Series Baseline Multiclass Pipeline"), - ProblemTypes.TIME_SERIES_BINARY: (TimeSeriesBinaryClassificationPipeline, "Time Series Baseline Binary Pipeline")}[self.problem_type] - date_index = self.problem_configuration['date_index'] - gap = self.problem_configuration['gap'] - max_delay = self.problem_configuration['max_delay'] - baseline = pipeline_class(component_graph=["Time Series Baseline Estimator"], - custom_name=pipeline_name, - parameters={"pipeline": {"date_index": date_index, "gap": gap, "max_delay": max_delay}, - "Time Series Baseline Estimator": {"date_index": date_index, "gap": gap, "max_delay": max_delay}}) + pipeline_class, pipeline_name = { + ProblemTypes.TIME_SERIES_REGRESSION: ( + TimeSeriesRegressionPipeline, + "Time Series Baseline Regression Pipeline", + ), + ProblemTypes.TIME_SERIES_MULTICLASS: ( + TimeSeriesMulticlassClassificationPipeline, + "Time Series Baseline Multiclass Pipeline", + ), + ProblemTypes.TIME_SERIES_BINARY: ( + TimeSeriesBinaryClassificationPipeline, + "Time Series 
Baseline Binary Pipeline", + ), + }[self.problem_type] + date_index = self.problem_configuration["date_index"] + gap = self.problem_configuration["gap"] + max_delay = self.problem_configuration["max_delay"] + baseline = pipeline_class( + component_graph=["Time Series Baseline Estimator"], + custom_name=pipeline_name, + parameters={ + "pipeline": { + "date_index": date_index, + "gap": gap, + "max_delay": max_delay, + }, + "Time Series Baseline Estimator": { + "date_index": date_index, + "gap": gap, + "max_delay": max_delay, + }, + }, + ) return baseline def _add_baseline_pipelines(self): @@ -759,9 +997,15 @@ def _add_baseline_pipelines(self): baseline = self._get_baseline_pipeline() self._pre_evaluation_callback(baseline) logger.info(f"Evaluating Baseline Pipeline: {baseline.name}") - computation = self._engine.submit_evaluation_job(self.automl_config, baseline, self.X_train, self.y_train) + computation = self._engine.submit_evaluation_job( + self.automl_config, baseline, self.X_train, self.y_train + ) evaluation = computation.get_result() - data, pipeline, job_log = evaluation.get('scores'), evaluation.get("pipeline"), evaluation.get("logger") + data, pipeline, job_log = ( + evaluation.get("scores"), + evaluation.get("pipeline"), + evaluation.get("logger"), + ) self._post_evaluation_callback(pipeline, data, job_log) @staticmethod @@ -769,25 +1013,29 @@ def _get_mean_cv_scores_for_all_objectives(cv_data, objective_name_to_class): scores = defaultdict(int) n_folds = len(cv_data) for fold_data in cv_data: - for field, value in fold_data['all_objective_scores'].items(): + for field, value in fold_data["all_objective_scores"].items(): # The 'all_objective_scores' field contains scores for all objectives # but also fields like "# Training" and "# Testing", so we want to exclude them since # they are not scores if field in objective_name_to_class: scores[field] += value - return {objective: float(score) / n_folds for objective, score in scores.items()} + return { + objective: float(score) / n_folds for objective, score in scores.items() + } def _post_evaluation_callback(self, pipeline, evaluation_results, job_log): job_log.write_to_logger(logger) - training_time = evaluation_results['training_time'] - cv_data = evaluation_results['cv_data'] - cv_scores = evaluation_results['cv_scores'] + training_time = evaluation_results["training_time"] + cv_data = evaluation_results["cv_data"] + cv_scores = evaluation_results["cv_scores"] is_baseline = pipeline.model_family == ModelFamily.BASELINE cv_score = cv_scores.mean() cv_sd = cv_scores.std() percent_better_than_baseline = {} - mean_cv_all_objectives = self._get_mean_cv_scores_for_all_objectives(cv_data, self.objective_name_to_class) + mean_cv_all_objectives = self._get_mean_cv_scores_for_all_objectives( + cv_data, self.objective_name_to_class + ) if is_baseline: self._baseline_cv_scores = mean_cv_all_objectives for obj_name in mean_cv_all_objectives: @@ -795,14 +1043,16 @@ def _post_evaluation_callback(self, pipeline, evaluation_results, job_log): # In the event add_to_rankings is called before search _baseline_cv_scores will be empty so we will return # nan for the base score. 
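            # A rough illustration of the comparison below (hypothetical numbers): with a
            # baseline mean CV log loss of 0.70 and a pipeline mean CV log loss of 0.35,
            #     objective_class.calculate_percent_difference(0.35, 0.70)
            # yields a positive "percent better than baseline" value (the sign convention
            # accounts for log loss being a lower-is-better objective); a pipeline that
            # scores worse than the baseline gets a negative value instead.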
- percent_better = objective_class.calculate_percent_difference(mean_cv_all_objectives[obj_name], - self._baseline_cv_scores.get(obj_name, np.nan)) + percent_better = objective_class.calculate_percent_difference( + mean_cv_all_objectives[obj_name], + self._baseline_cv_scores.get(obj_name, np.nan), + ) percent_better_than_baseline[obj_name] = percent_better high_variance_cv = self._check_for_high_variance(pipeline, cv_score, cv_sd) - pipeline_id = len(self._results['pipeline_results']) - self._results['pipeline_results'][pipeline_id] = { + pipeline_id = len(self._results["pipeline_results"]) + self._results["pipeline_results"][pipeline_id] = { "id": pipeline_id, "pipeline_name": pipeline.name, "pipeline_class": pipeline.__class__, @@ -814,21 +1064,34 @@ def _post_evaluation_callback(self, pipeline, evaluation_results, job_log): "training_time": training_time, "cv_data": cv_data, "percent_better_than_baseline_all_objectives": percent_better_than_baseline, - "percent_better_than_baseline": percent_better_than_baseline[self.objective.name], - "validation_score": cv_scores[0] + "percent_better_than_baseline": percent_better_than_baseline[ + self.objective.name + ], + "validation_score": cv_scores[0], } self._pipelines_searched.update({pipeline_id: pipeline.clone()}) if pipeline.model_family == ModelFamily.ENSEMBLE: - input_pipeline_ids = [self._automl_algorithm._best_pipeline_info[model_family]["id"] for model_family in self._automl_algorithm._best_pipeline_info] - self._results['pipeline_results'][pipeline_id]["input_pipeline_ids"] = input_pipeline_ids + input_pipeline_ids = [ + self._automl_algorithm._best_pipeline_info[model_family]["id"] + for model_family in self._automl_algorithm._best_pipeline_info + ] + self._results["pipeline_results"][pipeline_id][ + "input_pipeline_ids" + ] = input_pipeline_ids - self._results['search_order'].append(pipeline_id) + self._results["search_order"].append(pipeline_id) if not is_baseline: - score_to_minimize = -cv_score if self.objective.greater_is_better else cv_score + score_to_minimize = ( + -cv_score if self.objective.greater_is_better else cv_score + ) try: - self._automl_algorithm.add_result(score_to_minimize, pipeline, self._results['pipeline_results'][pipeline_id]) + self._automl_algorithm.add_result( + score_to_minimize, + pipeline, + self._results["pipeline_results"][pipeline_id], + ) except PipelineNotFoundError: pass @@ -836,7 +1099,9 @@ def _post_evaluation_callback(self, pipeline, evaluation_results, job_log): self.search_iteration_plot.update() if self.add_result_callback: - self.add_result_callback(self._results['pipeline_results'][pipeline_id], pipeline, self) + self.add_result_callback( + self._results["pipeline_results"][pipeline_id], pipeline, self + ) return pipeline_id def _check_for_high_variance(self, pipeline, cv_mean, cv_std, threshold=0.2): @@ -847,7 +1112,9 @@ def _check_for_high_variance(self, pipeline, cv_mean, cv_std, threshold=0.2): if cv_std != 0 and cv_mean != 0: high_variance_cv = bool(abs(cv_std / cv_mean) > threshold) if high_variance_cv: - logger.warning(f"\tHigh coefficient of variation (cv >= {threshold}) within cross validation scores.\n\t{pipeline_name} may not perform as estimated on unseen data.") + logger.warning( + f"\tHigh coefficient of variation (cv >= {threshold}) within cross validation scores.\n\t{pipeline_name} may not perform as estimated on unseen data." 
+ ) return high_variance_cv def get_pipeline(self, pipeline_id): @@ -860,13 +1127,15 @@ def get_pipeline(self, pipeline_id): Returns: PipelineBase: untrained pipeline instance associated with the provided ID """ - pipeline_results = self.results['pipeline_results'].get(pipeline_id) + pipeline_results = self.results["pipeline_results"].get(pipeline_id) if pipeline_results is None: raise PipelineNotFoundError("Pipeline not found in automl results") pipeline = self._pipelines_searched.get(pipeline_id) - parameters = pipeline_results.get('parameters') + parameters = pipeline_results.get("parameters") if pipeline is None or parameters is None: - raise PipelineNotFoundError("Pipeline class or parameters not found in automl results") + raise PipelineNotFoundError( + "Pipeline class or parameters not found in automl results" + ) return pipeline.new(parameters, random_seed=self.random_seed) def describe_pipeline(self, pipeline_id, return_dict=False): @@ -881,43 +1150,65 @@ def describe_pipeline(self, pipeline_id, return_dict=False): Description of specified pipeline. Includes information such as type of pipeline components, problem, training time, cross validation, etc. """ - if pipeline_id not in self._results['pipeline_results']: + if pipeline_id not in self._results["pipeline_results"]: raise PipelineNotFoundError("Pipeline not found") pipeline = self.get_pipeline(pipeline_id) - pipeline_results = self._results['pipeline_results'][pipeline_id] + pipeline_results = self._results["pipeline_results"][pipeline_id] pipeline.describe() if pipeline.model_family == ModelFamily.ENSEMBLE: - logger.info("Input for ensembler are pipelines with IDs: " + str(pipeline_results['input_pipeline_ids'])) + logger.info( + "Input for ensembler are pipelines with IDs: " + + str(pipeline_results["input_pipeline_ids"]) + ) log_subtitle(logger, "Training") logger.info("Training for {} problems.".format(pipeline.problem_type)) - if self.optimize_thresholds and self.objective.is_defined_for_problem_type(ProblemTypes.BINARY) and self.objective.can_optimize_threshold: - logger.info("Objective to optimize binary classification pipeline thresholds for: {}".format(self.objective)) - - logger.info("Total training time (including CV): %.1f seconds" % pipeline_results["training_time"]) + if ( + self.optimize_thresholds + and self.objective.is_defined_for_problem_type(ProblemTypes.BINARY) + and self.objective.can_optimize_threshold + ): + logger.info( + "Objective to optimize binary classification pipeline thresholds for: {}".format( + self.objective + ) + ) + + logger.info( + "Total training time (including CV): %.1f seconds" + % pipeline_results["training_time"] + ) log_subtitle(logger, "Cross Validation", underline="-") - all_objective_scores = [fold["all_objective_scores"] for fold in pipeline_results["cv_data"]] + all_objective_scores = [ + fold["all_objective_scores"] for fold in pipeline_results["cv_data"] + ] all_objective_scores = pd.DataFrame(all_objective_scores) for c in all_objective_scores: if c in ["# Training", "# Validation"]: - all_objective_scores[c] = all_objective_scores[c].map(lambda x: '{:2,.0f}'.format(x) if not pd.isna(x) else np.nan) + all_objective_scores[c] = all_objective_scores[c].map( + lambda x: "{:2,.0f}".format(x) if not pd.isna(x) else np.nan + ) continue mean = all_objective_scores[c].mean(axis=0) std = all_objective_scores[c].std(axis=0) all_objective_scores.loc["mean", c] = mean all_objective_scores.loc["std", c] = std - all_objective_scores.loc["coef of var", c] = std / mean if abs(mean) > 0 
else np.inf + all_objective_scores.loc["coef of var", c] = ( + std / mean if abs(mean) > 0 else np.inf + ) all_objective_scores = all_objective_scores.fillna("-") - with pd.option_context('display.float_format', '{:.3f}'.format, 'expand_frame_repr', False): + with pd.option_context( + "display.float_format", "{:.3f}".format, "expand_frame_repr", False + ): logger.info(all_objective_scores) if return_dict: @@ -929,13 +1220,21 @@ def add_to_rankings(self, pipeline): Arguments: pipeline (PipelineBase): pipeline to train and evaluate. """ - pipeline_rows = self.full_rankings[self.full_rankings['pipeline_name'] == pipeline.name] - for parameter in pipeline_rows['parameters']: + pipeline_rows = self.full_rankings[ + self.full_rankings["pipeline_name"] == pipeline.name + ] + for parameter in pipeline_rows["parameters"]: if pipeline.parameters == parameter: return - computation = self._engine.submit_evaluation_job(self.automl_config, pipeline, self.X_train, self.y_train) + computation = self._engine.submit_evaluation_job( + self.automl_config, pipeline, self.X_train, self.y_train + ) evaluation = computation.get_result() - data, pipeline, job_log = evaluation.get('scores'), evaluation.get("pipeline"), evaluation.get("logger") + data, pipeline, job_log = ( + evaluation.get("scores"), + evaluation.get("pipeline"), + evaluation.get("logger"), + ) self._post_evaluation_callback(pipeline, data, job_log) self._find_best_pipeline() @@ -943,9 +1242,9 @@ def add_to_rankings(self, pipeline): def results(self): """Class that allows access to a copy of the results from `automl_search`. - Returns: dict containing `pipeline_results`: a dict with results from each pipeline, - and `search_order`: a list describing the order the pipelines were searched. - """ + Returns: dict containing `pipeline_results`: a dict with results from each pipeline, + and `search_order`: a list describing the order the pipelines were searched. + """ return copy.deepcopy(self._results) @property @@ -960,12 +1259,20 @@ def full_rankings(self): if self.objective.greater_is_better: ascending = False - full_rankings_cols = ["id", "pipeline_name", "mean_cv_score", "standard_deviation_cv_score", - "validation_score", "percent_better_than_baseline", "high_variance_cv", "parameters"] - if not self._results['pipeline_results']: + full_rankings_cols = [ + "id", + "pipeline_name", + "mean_cv_score", + "standard_deviation_cv_score", + "validation_score", + "percent_better_than_baseline", + "high_variance_cv", + "parameters", + ] + if not self._results["pipeline_results"]: return pd.DataFrame(columns=full_rankings_cols) - rankings_df = pd.DataFrame(self._results['pipeline_results'].values()) + rankings_df = pd.DataFrame(self._results["pipeline_results"].values()) rankings_df = rankings_df[full_rankings_cols] rankings_df.sort_values("mean_cv_score", ascending=ascending, inplace=True) rankings_df.reset_index(drop=True, inplace=True) @@ -979,7 +1286,9 @@ def best_pipeline(self): PipelineBase: A trained instance of the best pipeline and parameters found during automl search. If `train_best_pipeline` is set to False, returns an untrained pipeline instance. """ if not self._best_pipeline: - raise PipelineNotFoundError("automl search must be run before selecting `best_pipeline`.") + raise PipelineNotFoundError( + "automl search must be run before selecting `best_pipeline`." 
+ ) return self._best_pipeline @@ -993,7 +1302,7 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): Returns: None """ - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: cloudpickle.dump(self, f, protocol=pickle_protocol) @staticmethod @@ -1006,7 +1315,7 @@ def load(file_path): Returns: AutoSearchBase object """ - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: return cloudpickle.load(f) def train_pipelines(self, pipelines): @@ -1029,7 +1338,11 @@ def train_pipelines(self, pipelines): y_train = self.y_train for pipeline in pipelines: - computations.append(self._engine.submit_training_job(self.automl_config, pipeline, X_train, y_train)) + computations.append( + self._engine.submit_training_job( + self.automl_config, pipeline, X_train, y_train + ) + ) while computations: computation = computations.pop(0) @@ -1038,7 +1351,7 @@ def train_pipelines(self, pipelines): fitted_pipeline = computation.get_result() fitted_pipelines[fitted_pipeline.name] = fitted_pipeline except Exception as e: - logger.error(f'Train error for {pipeline.name}: {str(e)}') + logger.error(f"Train error for {pipeline.name}: {str(e)}") tb = traceback.format_tb(sys.exc_info()[2]) logger.error("Traceback:") logger.error("\n".join(tb)) @@ -1061,14 +1374,20 @@ def score_pipelines(self, pipelines, X_holdout, y_holdout, objectives): Note that the any pipelines that error out during scoring will not be included in the dictionary but the exception and stacktrace will be displayed in the log. """ - X_holdout, y_holdout = infer_feature_types(X_holdout), infer_feature_types(y_holdout) + X_holdout, y_holdout = infer_feature_types(X_holdout), infer_feature_types( + y_holdout + ) check_all_pipeline_names_unique(pipelines) scores = {} objectives = [get_objective(o, return_instance=True) for o in objectives] computations = [] for pipeline in pipelines: - computations.append(self._engine.submit_scoring_job(self.automl_config, pipeline, X_holdout, y_holdout, objectives)) + computations.append( + self._engine.submit_scoring_job( + self.automl_config, pipeline, X_holdout, y_holdout, objectives + ) + ) while computations: computation = computations.pop(0) @@ -1087,7 +1406,9 @@ def score_pipelines(self, pipelines, X_holdout, y_holdout, objectives): tb = traceback.format_tb(sys.exc_info()[2]) logger.error("Traceback:") logger.error("\n".join(tb)) - scores[pipeline_name] = {objective.name: np.nan for objective in objectives} + scores[pipeline_name] = { + objective.name: np.nan for objective in objectives + } else: computations.append(computation) return scores diff --git a/evalml/automl/callbacks.py b/evalml/automl/callbacks.py index d8d9a5e275..1e2786acec 100644 --- a/evalml/automl/callbacks.py +++ b/evalml/automl/callbacks.py @@ -10,24 +10,32 @@ def silent_error_callback(exception, traceback, automl, **kwargs): def raise_error_callback(exception, traceback, automl, **kwargs): """Raises the exception thrown by the AutoMLSearch object. Also logs the exception as an error.""" - logger.error(f'AutoML search raised a fatal exception: {str(exception)}') + logger.error(f"AutoML search raised a fatal exception: {str(exception)}") logger.error("\n".join(traceback)) raise exception def log_error_callback(exception, traceback, automl, **kwargs): """Logs the exception thrown as an error. Will not throw. 
This is the default behavior for AutoMLSearch.""" - fold_num = kwargs.get('fold_num') - pipeline = kwargs.get('pipeline') + fold_num = kwargs.get("fold_num") + pipeline = kwargs.get("pipeline") trace = "\n".join(traceback) if traceback else "" if isinstance(exception, PipelineScoreError): - logger.info(f"\t\t\tFold {fold_num}: Encountered an error scoring the following objectives: {', '.join(exception.exceptions)}.") - logger.info(f"\t\t\tFold {fold_num}: The scores for these objectives will be replaced with nan.") + logger.info( + f"\t\t\tFold {fold_num}: Encountered an error scoring the following objectives: {', '.join(exception.exceptions)}." + ) + logger.info( + f"\t\t\tFold {fold_num}: The scores for these objectives will be replaced with nan." + ) trace += f"\n{exception.message}" else: logger.info(f"\t\t\tFold {fold_num}: Encountered an error.") logger.info(f"\t\t\tFold {fold_num}: All scores will be replaced with nan.") - logger.info(f"\t\t\tFold {fold_num}: Please check {logger.handlers[1].baseFilename} for the current hyperparameters and stack trace.") - logger.info(f"\t\t\tFold {fold_num}: Exception during automl search: {str(exception)}") + logger.info( + f"\t\t\tFold {fold_num}: Please check {logger.handlers[1].baseFilename} for the current hyperparameters and stack trace." + ) + logger.info( + f"\t\t\tFold {fold_num}: Exception during automl search: {str(exception)}" + ) logger.debug(f"\t\t\tFold {fold_num}: Hyperparameters:\n\t{pipeline.parameters}") logger.debug(f"\t\t\tFold {fold_num}: Traceback:\n{trace}") diff --git a/evalml/automl/engine/__init__.py b/evalml/automl/engine/__init__.py index b58e36a20b..a1a8cec1c5 100644 --- a/evalml/automl/engine/__init__.py +++ b/evalml/automl/engine/__init__.py @@ -1,3 +1,9 @@ -from .engine_base import EngineBase, EngineComputation, train_pipeline, train_and_score_pipeline, evaluate_pipeline +from .engine_base import ( + EngineBase, + EngineComputation, + train_pipeline, + train_and_score_pipeline, + evaluate_pipeline, +) from .sequential_engine import SequentialEngine from .dask_engine import DaskEngine diff --git a/evalml/automl/engine/dask_engine.py b/evalml/automl/engine/dask_engine.py index b39ce080e3..ad7a192f6a 100644 --- a/evalml/automl/engine/dask_engine.py +++ b/evalml/automl/engine/dask_engine.py @@ -6,7 +6,7 @@ EngineComputation, evaluate_pipeline, score_pipeline, - train_pipeline + train_pipeline, ) @@ -44,7 +44,9 @@ class DaskEngine(EngineBase): def __init__(self, client): if not isinstance(client, Client): - raise TypeError(f"Expected dask.distributed.Client, received {type(client)}") + raise TypeError( + f"Expected dask.distributed.Client, received {type(client)}" + ) self.client = client self._data_futures_cache = {} @@ -65,7 +67,9 @@ def send_data_to_cluster(self, X, y): X_future, y_future = self._data_futures_cache[data_hash] if not (X_future.cancelled() or y_future.cancelled()): return X_future, y_future - self._data_futures_cache[data_hash] = self.client.scatter([X, y], broadcast=True) + self._data_futures_cache[data_hash] = self.client.scatter( + [X, y], broadcast=True + ) return self._data_futures_cache[data_hash] def submit_evaluation_job(self, automl_config, pipeline, X, y) -> EngineComputation: @@ -82,11 +86,14 @@ def submit_evaluation_job(self, automl_config, pipeline, X, y) -> EngineComputat """ logger = self.setup_job_log() X, y = self.send_data_to_cluster(X, y) - dask_future = self.client.submit(evaluate_pipeline, pipeline=pipeline, - automl_config=automl_config, - X=X, - y=y, - logger=logger) + 
dask_future = self.client.submit( + evaluate_pipeline, + pipeline=pipeline, + automl_config=automl_config, + X=X, + y=y, + logger=logger, + ) return DaskComputation(dask_future) def submit_training_job(self, automl_config, pipeline, X, y) -> EngineComputation: @@ -102,16 +109,21 @@ def submit_training_job(self, automl_config, pipeline, X, y) -> EngineComputatio occurring in the dask cluster """ X, y = self.send_data_to_cluster(X, y) - dask_future = self.client.submit(train_pipeline, - pipeline=pipeline, X=X, - y=y, - optimize_thresholds=automl_config.optimize_thresholds, - objective=automl_config.objective, - X_schema=automl_config.X_schema, - y_schema=automl_config.y_schema) + dask_future = self.client.submit( + train_pipeline, + pipeline=pipeline, + X=X, + y=y, + optimize_thresholds=automl_config.optimize_thresholds, + objective=automl_config.objective, + X_schema=automl_config.X_schema, + y_schema=automl_config.y_schema, + ) return DaskComputation(dask_future) - def submit_scoring_job(self, automl_config, pipeline, X, y, objectives) -> EngineComputation: + def submit_scoring_job( + self, automl_config, pipeline, X, y, objectives + ) -> EngineComputation: """Send scoring job to cluster. Args: @@ -127,9 +139,15 @@ def submit_scoring_job(self, automl_config, pipeline, X, y, objectives) -> Engin X_schema = X.ww.schema y_schema = y.ww.schema X, y = self.send_data_to_cluster(X, y) - dask_future = self.client.submit(score_pipeline, pipeline=pipeline, - X=X, y=y, objectives=objectives, - X_schema=X_schema, y_schema=y_schema) + dask_future = self.client.submit( + score_pipeline, + pipeline=pipeline, + X=X, + y=y, + objectives=objectives, + X_schema=X_schema, + y_schema=y_schema, + ) computation = DaskComputation(dask_future) computation.meta_data["pipeline_name"] = pipeline.name return computation diff --git a/evalml/automl/engine/engine_base.py b/evalml/automl/engine/engine_base.py index 21056acc1d..93d65ab5f2 100644 --- a/evalml/automl/engine/engine_base.py +++ b/evalml/automl/engine/engine_base.py @@ -63,17 +63,18 @@ def error(self, msg): def write_to_logger(self, logger): """Write all the messages to the logger. First In First Out order.""" - logger_method = {"info": logger.info, - "debug": logger.debug, - "warning": logger.warning, - "error": logger.warning} + logger_method = { + "info": logger.info, + "debug": logger.debug, + "warning": logger.warning, + "error": logger.warning, + } for level, message in self.logs: method = logger_method[level] method(message) class EngineBase(ABC): - @staticmethod def setup_job_log(): return JobLogger() @@ -91,7 +92,9 @@ def submit_scoring_job(self, automl_config, pipeline, X, y, objectives): """Submit job for pipeline scoring.""" -def train_pipeline(pipeline, X, y, optimize_thresholds, objective, X_schema=None, y_schema=None): +def train_pipeline( + pipeline, X, y, optimize_thresholds, objective, X_schema=None, y_schema=None +): """Train a pipeline and tune the threshold if necessary. 
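
    A minimal illustrative call, mirroring how the engines invoke it (the names here are
    placeholders for objects already in scope):

        fitted_pipeline = train_pipeline(
            pipeline, X, y, optimize_thresholds=True, objective=objective
        )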
Arguments: @@ -113,16 +116,24 @@ def train_pipeline(pipeline, X, y, optimize_thresholds, objective, X_schema=None if y_schema: y.ww.init(schema=y_schema) if optimize_thresholds and pipeline.can_tune_threshold_with_objective(objective): - X, X_threshold_tuning, y, y_threshold_tuning = split_data(X, y, pipeline.problem_type, - test_size=0.2, random_seed=pipeline.random_seed) + X, X_threshold_tuning, y, y_threshold_tuning = split_data( + X, y, pipeline.problem_type, test_size=0.2, random_seed=pipeline.random_seed + ) cv_pipeline = pipeline.clone() cv_pipeline.fit(X, y) - tune_binary_threshold(cv_pipeline, objective, cv_pipeline.problem_type, - X_threshold_tuning, y_threshold_tuning) + tune_binary_threshold( + cv_pipeline, + objective, + cv_pipeline.problem_type, + X_threshold_tuning, + y_threshold_tuning, + ) return cv_pipeline -def train_and_score_pipeline(pipeline, automl_config, full_X_train, full_y_train, logger): +def train_and_score_pipeline( + pipeline, automl_config, full_X_train, full_y_train, logger +): """Given a pipeline, config and data, train and score the pipeline and return the CV or TV scores Arguments: @@ -140,48 +151,97 @@ def train_and_score_pipeline(pipeline, automl_config, full_X_train, full_y_train logger.info("\tStarting cross validation") # Encode target for classification problems so that we can support float targets. This is okay because we only use split to get the indices to split on if is_classification(automl_config.problem_type): - y_mapping = {original_target: encoded_target for (encoded_target, original_target) in - enumerate(full_y_train.value_counts().index)} + y_mapping = { + original_target: encoded_target + for (encoded_target, original_target) in enumerate( + full_y_train.value_counts().index + ) + } full_y_train = ww.init_series(full_y_train.map(y_mapping)) cv_pipeline = pipeline - for i, (train, valid) in enumerate(automl_config.data_splitter.split(full_X_train, full_y_train)): + for i, (train, valid) in enumerate( + automl_config.data_splitter.split(full_X_train, full_y_train) + ): if pipeline.model_family == ModelFamily.ENSEMBLE and i > 0: # Stacked ensembles do CV internally, so we do not run CV here for performance reasons. - logger.debug(f"Skipping fold {i} because CV for stacked ensembles is not supported.") + logger.debug( + f"Skipping fold {i} because CV for stacked ensembles is not supported." + ) break logger.debug(f"\t\tTraining and scoring on fold {i}") X_train, X_valid = full_X_train.ww.iloc[train], full_X_train.ww.iloc[valid] y_train, y_valid = full_y_train.ww.iloc[train], full_y_train.ww.iloc[valid] - if is_binary(automl_config.problem_type) or is_multiclass(automl_config.problem_type): + if is_binary(automl_config.problem_type) or is_multiclass( + automl_config.problem_type + ): diff_train = set(np.setdiff1d(full_y_train, y_train)) diff_valid = set(np.setdiff1d(full_y_train, y_valid)) - diff_string = f"Missing target values in the training set after data split: {diff_train}. " if diff_train else "" - diff_string += f"Missing target values in the validation set after data split: {diff_valid}." if diff_valid else "" + diff_string = ( + f"Missing target values in the training set after data split: {diff_train}. " + if diff_train + else "" + ) + diff_string += ( + f"Missing target values in the validation set after data split: {diff_valid}." 
+ if diff_valid + else "" + ) if diff_string: raise Exception(diff_string) - objectives_to_score = [automl_config.objective] + automl_config.additional_objectives + objectives_to_score = [ + automl_config.objective + ] + automl_config.additional_objectives try: logger.debug(f"\t\t\tFold {i}: starting training") - cv_pipeline = train_pipeline(pipeline, X_train, y_train, automl_config.optimize_thresholds, automl_config.objective) + cv_pipeline = train_pipeline( + pipeline, + X_train, + y_train, + automl_config.optimize_thresholds, + automl_config.objective, + ) logger.debug(f"\t\t\tFold {i}: finished training") - if automl_config.optimize_thresholds and pipeline.can_tune_threshold_with_objective(automl_config.objective): - logger.debug(f"\t\t\tFold {i}: Optimal threshold found ({cv_pipeline.threshold:.3f})") + if ( + automl_config.optimize_thresholds + and pipeline.can_tune_threshold_with_objective(automl_config.objective) + ): + logger.debug( + f"\t\t\tFold {i}: Optimal threshold found ({cv_pipeline.threshold:.3f})" + ) logger.debug(f"\t\t\tFold {i}: Scoring trained pipeline") scores = cv_pipeline.score(X_valid, y_valid, objectives=objectives_to_score) - logger.debug(f"\t\t\tFold {i}: {automl_config.objective.name} score: {scores[automl_config.objective.name]:.3f}") + logger.debug( + f"\t\t\tFold {i}: {automl_config.objective.name} score: {scores[automl_config.objective.name]:.3f}" + ) score = scores[automl_config.objective.name] except Exception as e: if automl_config.error_callback is not None: - automl_config.error_callback(exception=e, traceback=traceback.format_tb(sys.exc_info()[2]), automl=automl_config, - fold_num=i, pipeline=pipeline) + automl_config.error_callback( + exception=e, + traceback=traceback.format_tb(sys.exc_info()[2]), + automl=automl_config, + fold_num=i, + pipeline=pipeline, + ) if isinstance(e, PipelineScoreError): nan_scores = {objective: np.nan for objective in e.exceptions} scores = {**nan_scores, **e.scored_successfully} - scores = OrderedDict({o.name: scores[o.name] for o in [automl_config.objective] + automl_config.additional_objectives}) + scores = OrderedDict( + { + o.name: scores[o.name] + for o in [automl_config.objective] + + automl_config.additional_objectives + } + ) score = scores[automl_config.objective.name] else: score = np.nan - scores = OrderedDict(zip([n.name for n in automl_config.additional_objectives], [np.nan] * len(automl_config.additional_objectives))) + scores = OrderedDict( + zip( + [n.name for n in automl_config.additional_objectives], + [np.nan] * len(automl_config.additional_objectives), + ) + ) ordered_scores = OrderedDict() ordered_scores.update({automl_config.objective.name: score}) @@ -189,17 +249,34 @@ def train_and_score_pipeline(pipeline, automl_config, full_X_train, full_y_train ordered_scores.update({"# Training": y_train.shape[0]}) ordered_scores.update({"# Validation": y_valid.shape[0]}) - evaluation_entry = {"all_objective_scores": ordered_scores, "mean_cv_score": score, 'binary_classification_threshold': None} - if is_binary(automl_config.problem_type) and cv_pipeline is not None and cv_pipeline.threshold is not None: - evaluation_entry['binary_classification_threshold'] = cv_pipeline.threshold + evaluation_entry = { + "all_objective_scores": ordered_scores, + "mean_cv_score": score, + "binary_classification_threshold": None, + } + if ( + is_binary(automl_config.problem_type) + and cv_pipeline is not None + and cv_pipeline.threshold is not None + ): + evaluation_entry["binary_classification_threshold"] = 
cv_pipeline.threshold cv_data.append(evaluation_entry) training_time = time.time() - start cv_scores = pd.Series([fold["mean_cv_score"] for fold in cv_data]) cv_score_mean = cv_scores.mean() - logger.info(f"\tFinished cross validation - mean {automl_config.objective.name}: {cv_score_mean:.3f}") - return {"scores": {'cv_data': cv_data, 'training_time': training_time, 'cv_scores': cv_scores, 'cv_score_mean': cv_score_mean}, - "pipeline": cv_pipeline, - "logger": logger} + logger.info( + f"\tFinished cross validation - mean {automl_config.objective.name}: {cv_score_mean:.3f}" + ) + return { + "scores": { + "cv_data": cv_data, + "training_time": training_time, + "cv_scores": cv_scores, + "cv_score_mean": cv_score_mean, + }, + "pipeline": cv_pipeline, + "logger": logger, + } def evaluate_pipeline(pipeline, automl_config, X, y, logger): @@ -220,9 +297,13 @@ def evaluate_pipeline(pipeline, automl_config, X, y, logger): X.ww.init(schema=automl_config.X_schema) y.ww.init(schema=automl_config.y_schema) - return train_and_score_pipeline(pipeline, automl_config=automl_config, - full_X_train=X, full_y_train=y, - logger=logger) + return train_and_score_pipeline( + pipeline, + automl_config=automl_config, + full_X_train=X, + full_y_train=y, + logger=logger, + ) def score_pipeline(pipeline, X, y, objectives, X_schema=None, y_schema=None): diff --git a/evalml/automl/engine/sequential_engine.py b/evalml/automl/engine/sequential_engine.py index 461e359617..822218ae5c 100644 --- a/evalml/automl/engine/sequential_engine.py +++ b/evalml/automl/engine/sequential_engine.py @@ -3,7 +3,7 @@ EngineComputation, evaluate_pipeline, score_pipeline, - train_pipeline + train_pipeline, ) from evalml.objectives.utils import get_objective @@ -44,26 +44,37 @@ class SequentialEngine(EngineBase): def submit_evaluation_job(self, automl_config, pipeline, X, y): logger = self.setup_job_log() - return SequentialComputation(work=evaluate_pipeline, - pipeline=pipeline, - automl_config=automl_config, X=X, - y=y, - logger=logger) + return SequentialComputation( + work=evaluate_pipeline, + pipeline=pipeline, + automl_config=automl_config, + X=X, + y=y, + logger=logger, + ) def submit_training_job(self, automl_config, pipeline, X, y): - return SequentialComputation(work=train_pipeline, - pipeline=pipeline, X=X, - y=y, - optimize_thresholds=automl_config.optimize_thresholds, - objective=automl_config.objective, - X_schema=automl_config.X_schema, - y_schema=automl_config.y_schema) + return SequentialComputation( + work=train_pipeline, + pipeline=pipeline, + X=X, + y=y, + optimize_thresholds=automl_config.optimize_thresholds, + objective=automl_config.objective, + X_schema=automl_config.X_schema, + y_schema=automl_config.y_schema, + ) def submit_scoring_job(self, automl_config, pipeline, X, y, objectives): objectives = [get_objective(o, return_instance=True) for o in objectives] - computation = SequentialComputation(work=score_pipeline, - pipeline=pipeline, - X=X, y=y, objectives=objectives, - X_schema=X.ww.schema, y_schema=y.ww.schema) + computation = SequentialComputation( + work=score_pipeline, + pipeline=pipeline, + X=X, + y=y, + objectives=objectives, + X_schema=X.ww.schema, + y_schema=y.ww.schema, + ) computation.meta_data["pipeline_name"] = pipeline.name return computation diff --git a/evalml/automl/pipeline_search_plots.py b/evalml/automl/pipeline_search_plots.py index 566f60627a..2d490ebd59 100644 --- a/evalml/automl/pipeline_search_plots.py +++ b/evalml/automl/pipeline_search_plots.py @@ -1,9 +1,12 @@ from evalml.utils import 
import_or_raise, jupyter_check


-class SearchIterationPlot():
+class SearchIterationPlot:
     def __init__(self, data, show_plot=True):
-        self._go = import_or_raise("plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects")
+        self._go = import_or_raise(
+            "plotly.graph_objects",
+            error_msg="Cannot find dependency plotly.graph_objects",
+        )
 
         if jupyter_check():
             import_or_raise("ipywidgets", warning=True)
@@ -13,29 +16,31 @@ def __init__(self, data, show_plot=True):
         self.curr_iteration_scores = list()
         self.best_iteration_scores = list()
 
-        title = 'Pipeline Search: Iteration vs. {}<br>Gray marker indicates the score at current iteration'.format(self.data.objective.name)
+        title = "Pipeline Search: Iteration vs. {}<br>
Gray marker indicates the score at current iteration".format( + self.data.objective.name + ) data = [ - self._go.Scatter(x=[], y=[], mode='lines+markers', name='Best Score'), - self._go.Scatter(x=[], y=[], mode='markers', name='Iter score', marker={'color': 'gray'}) + self._go.Scatter(x=[], y=[], mode="lines+markers", name="Best Score"), + self._go.Scatter( + x=[], y=[], mode="markers", name="Iter score", marker={"color": "gray"} + ), ] layout = { - 'title': title, - 'xaxis': { - 'title': 'Iteration', - 'rangemode': 'tozero' - }, - 'yaxis': { - 'title': 'Score' - } + "title": title, + "xaxis": {"title": "Iteration", "rangemode": "tozero"}, + "yaxis": {"title": "Score"}, } self.best_score_by_iter_fig = self._go.FigureWidget(data, layout) self.best_score_by_iter_fig.update_layout(showlegend=False) self.update() def update(self): - if len(self.data.results['search_order']) > 0 and len(self.data.results['pipeline_results']) > 0: - iter_idx = self.data.results['search_order'] - pipeline_res = self.data.results['pipeline_results'] + if ( + len(self.data.results["search_order"]) > 0 + and len(self.data.results["pipeline_results"]) > 0 + ): + iter_idx = self.data.results["search_order"] + pipeline_res = self.data.results["pipeline_results"] iter_scores = [pipeline_res[i]["mean_cv_score"] for i in iter_idx] iter_score_pairs = zip(iter_idx, iter_scores) @@ -50,8 +55,12 @@ def update(self): best_iteration_scores.append(score) curr_best = score else: - if self.data.objective.greater_is_better and score > curr_best \ - or not self.data.objective.greater_is_better and score < curr_best: + if ( + self.data.objective.greater_is_better + and score > curr_best + or not self.data.objective.greater_is_better + and score < curr_best + ): best_iteration_scores.append(score) curr_best = score else: @@ -68,8 +77,7 @@ def update(self): class PipelineSearchPlots: - """Plots for the AutoMLSearch class. - """ + """Plots for the AutoMLSearch class.""" def __init__(self, data): """Make plots for the AutoMLSearch class. 
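
        A minimal usage sketch (assumes a completed AutoMLSearch object; the name
        `automl` is illustrative):

            plots = PipelineSearchPlots(automl)
            fig = plots.search_iteration_plot()  # static plotly Figure of best score per iteration
            fig.show()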
@@ -77,7 +85,10 @@ def __init__(self, data): Arguments: data (AutoMLSearch): Automated pipeline search object """ - self._go = import_or_raise("plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects") + self._go = import_or_raise( + "plotly.graph_objects", + error_msg="Cannot find dependency plotly.graph_objects", + ) self.data = data def search_iteration_plot(self, interactive_plot=False): @@ -90,7 +101,9 @@ def search_iteration_plot(self, interactive_plot=False): plot_obj = SearchIterationPlot(self.data) return self._go.Figure(plot_obj.best_score_by_iter_fig) try: - ipython_display = import_or_raise("IPython.display", error_msg="Cannot find dependency IPython.display") + ipython_display = import_or_raise( + "IPython.display", error_msg="Cannot find dependency IPython.display" + ) plot_obj = SearchIterationPlot(self.data) ipython_display.display(plot_obj.best_score_by_iter_fig) return plot_obj diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index e448ae3b93..3a29e53da6 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -6,15 +6,12 @@ from evalml.objectives import get_objective from evalml.pipelines import ComponentGraph -from evalml.preprocessing.data_splitters import ( - TimeSeriesSplit, - TrainingValidationSplit -) +from evalml.preprocessing.data_splitters import TimeSeriesSplit, TrainingValidationSplit from evalml.problem_types import ( ProblemTypes, handle_problem_types, is_binary, - is_time_series + is_time_series, ) from evalml.utils import import_or_raise @@ -33,16 +30,26 @@ def get_default_primary_search_objective(problem_type): ObjectiveBase: primary objective instance for the problem type. """ problem_type = handle_problem_types(problem_type) - objective_name = {'binary': 'Log Loss Binary', - 'multiclass': 'Log Loss Multiclass', - 'regression': 'R2', - 'time series regression': 'R2', - 'time series binary': 'Log Loss Binary', - 'time series multiclass': 'Log Loss Multiclass'}[problem_type.value] + objective_name = { + "binary": "Log Loss Binary", + "multiclass": "Log Loss Multiclass", + "regression": "R2", + "time series regression": "R2", + "time series binary": "Log Loss Binary", + "time series multiclass": "Log Loss Multiclass", + }[problem_type.value] return get_objective(objective_name, return_instance=True) -def make_data_splitter(X, y, problem_type, problem_configuration=None, n_splits=3, shuffle=True, random_seed=0): +def make_data_splitter( + X, + y, + problem_type, + problem_configuration=None, + n_splits=3, + shuffle=True, + random_seed=0, +): """Given the training data and ML problem parameters, compute a data splitting method to use during AutoML search. 
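
    For example (an illustrative call; X and y are any feature matrix and target already
    in memory):

        data_splitter = make_data_splitter(X, y, "binary", n_splits=3, random_seed=0)

    which returns a StratifiedKFold splitter for a typical binary problem, or a single
    TrainingValidationSplit once the number of rows crosses the large-data threshold.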
Arguments: @@ -62,18 +69,30 @@ def make_data_splitter(X, y, problem_type, problem_configuration=None, n_splits= problem_type = handle_problem_types(problem_type) if is_time_series(problem_type): if not problem_configuration: - raise ValueError("problem_configuration is required for time series problem types") - return TimeSeriesSplit(n_splits=n_splits, gap=problem_configuration.get('gap'), - max_delay=problem_configuration.get('max_delay'), date_index=problem_configuration.get('date_index')) + raise ValueError( + "problem_configuration is required for time series problem types" + ) + return TimeSeriesSplit( + n_splits=n_splits, + gap=problem_configuration.get("gap"), + max_delay=problem_configuration.get("max_delay"), + date_index=problem_configuration.get("date_index"), + ) if X.shape[0] > _LARGE_DATA_ROW_THRESHOLD: - return TrainingValidationSplit(test_size=_LARGE_DATA_PERCENT_VALIDATION, shuffle=shuffle) + return TrainingValidationSplit( + test_size=_LARGE_DATA_PERCENT_VALIDATION, shuffle=shuffle + ) if problem_type == ProblemTypes.REGRESSION: return KFold(n_splits=n_splits, random_state=random_seed, shuffle=shuffle) elif problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]: - return StratifiedKFold(n_splits=n_splits, random_state=random_seed, shuffle=shuffle) + return StratifiedKFold( + n_splits=n_splits, random_state=random_seed, shuffle=shuffle + ) -def tune_binary_threshold(pipeline, objective, problem_type, X_threshold_tuning, y_threshold_tuning): +def tune_binary_threshold( + pipeline, objective, problem_type, X_threshold_tuning, y_threshold_tuning +): """Tunes the threshold of a binary pipeline to the X and y thresholding data Arguments: @@ -83,12 +102,18 @@ def tune_binary_threshold(pipeline, objective, problem_type, X_threshold_tuning, X_threshold_tuning (pd.DataFrame): Features to tune pipeline to. y_threshold_tuning (pd.Series): Target data to tune pipeline to. """ - if is_binary(problem_type) and objective.is_defined_for_problem_type(problem_type) and objective.can_optimize_threshold: + if ( + is_binary(problem_type) + and objective.is_defined_for_problem_type(problem_type) + and objective.can_optimize_threshold + ): pipeline.threshold = 0.5 if X_threshold_tuning is not None: y_predict_proba = pipeline.predict_proba(X_threshold_tuning) y_predict_proba = y_predict_proba.iloc[:, 1] - pipeline.optimize_threshold(X_threshold_tuning, y_threshold_tuning, y_predict_proba, objective) + pipeline.optimize_threshold( + X_threshold_tuning, y_threshold_tuning, y_predict_proba, objective + ) def check_all_pipeline_names_unique(pipelines): @@ -109,13 +134,25 @@ def check_all_pipeline_names_unique(pipelines): if duplicate_names: plural, tense = ("s", "were") if len(duplicate_names) > 1 else ("", "was") duplicates = ", ".join([f"'{name}'" for name in sorted(duplicate_names)]) - raise ValueError(f"All pipeline names must be unique. The name{plural} {duplicates} {tense} repeated.") - - -AutoMLConfig = namedtuple("AutoMLConfig", ["data_splitter", "problem_type", - "objective", "additional_objectives", "optimize_thresholds", - "error_callback", "random_seed", - "X_schema", "y_schema"]) + raise ValueError( + f"All pipeline names must be unique. The name{plural} {duplicates} {tense} repeated." 
+ ) + + +AutoMLConfig = namedtuple( + "AutoMLConfig", + [ + "data_splitter", + "problem_type", + "objective", + "additional_objectives", + "optimize_thresholds", + "error_callback", + "random_seed", + "X_schema", + "y_schema", + ], +) def get_best_sampler_for_data(X, y, sampler_method, sampler_balanced_ratio): @@ -139,21 +176,23 @@ def get_best_sampler_for_data(X, y, sampler_method, sampler_balanced_ratio): if all(class_ratios >= sampler_balanced_ratio): return None # We set a threshold to use the Undersampler in order to avoid long runtimes - elif len(y) >= _SAMPLER_THRESHOLD and sampler_method != 'Oversampler': - return 'Undersampler' + elif len(y) >= _SAMPLER_THRESHOLD and sampler_method != "Oversampler": + return "Undersampler" else: try: - import_or_raise("imblearn.over_sampling", error_msg="imbalanced-learn is not installed") - cat_cols = X.ww.select('Categorical').columns + import_or_raise( + "imblearn.over_sampling", error_msg="imbalanced-learn is not installed" + ) + cat_cols = X.ww.select("Categorical").columns # Use different samplers depending on the number of categorical columns if len(cat_cols) == X.shape[1]: - return 'SMOTEN Oversampler' + return "SMOTEN Oversampler" elif not len(cat_cols): - return 'SMOTE Oversampler' + return "SMOTE Oversampler" else: - return 'SMOTENC Oversampler' + return "SMOTENC Oversampler" except ImportError: - return 'Undersampler' + return "Undersampler" def get_hyperparameter_ranges(component_graph, custom_hyperparameters): @@ -167,7 +206,9 @@ def get_hyperparameter_ranges(component_graph, custom_hyperparameters): Returns: dict: Dictionary of hyperparameter ranges for each component in the component graph. """ - linearized_component_graph = ComponentGraph.linearized_component_graph(component_graph) + linearized_component_graph = ComponentGraph.linearized_component_graph( + component_graph + ) hyperparameter_ranges = dict() for component_name, component_class in linearized_component_graph: component_hyperparameters = copy.copy(component_class.hyperparameter_ranges) diff --git a/evalml/data_checks/class_imbalance_data_check.py b/evalml/data_checks/class_imbalance_data_check.py index db880df814..23481a50e7 100644 --- a/evalml/data_checks/class_imbalance_data_check.py +++ b/evalml/data_checks/class_imbalance_data_check.py @@ -1,9 +1,8 @@ - from evalml.data_checks import ( DataCheck, DataCheckError, DataCheckMessageCode, - DataCheckWarning + DataCheckWarning, ) from evalml.utils import infer_feature_types @@ -25,13 +24,23 @@ def __init__(self, threshold=0.1, min_samples=100, num_cv_folds=3): num_cv_folds (int): The number of cross-validation folds. Must be positive. Choose 0 to ignore this warning. 
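
        Example (illustrative; these values simply spell out the defaults):
            >>> from evalml.data_checks import ClassImbalanceDataCheck
            >>> # Flag any class making up less than 10% of the target or having fewer
            >>> # than 100 samples, given a 3-fold cross validation split.
            >>> imbalance_check = ClassImbalanceDataCheck(threshold=0.1, min_samples=100, num_cv_folds=3)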
""" if threshold <= 0 or threshold > 0.5: - raise ValueError("Provided threshold {} is not within the range (0, 0.5]".format(threshold)) + raise ValueError( + "Provided threshold {} is not within the range (0, 0.5]".format( + threshold + ) + ) self.threshold = threshold if min_samples <= 0: - raise ValueError("Provided value min_samples {} is not greater than 0".format(min_samples)) + raise ValueError( + "Provided value min_samples {} is not greater than 0".format( + min_samples + ) + ) self.min_samples = min_samples if num_cv_folds < 0: - raise ValueError("Provided number of CV folds {} is less than 0".format(num_cv_folds)) + raise ValueError( + "Provided number of CV folds {} is less than 0".format(num_cv_folds) + ) self.cv_folds = num_cv_folds * 2 def validate(self, X, y): @@ -68,11 +77,7 @@ def validate(self, X, y): "details": {"target_values": [0]}}],\ "actions": []} """ - results = { - "warnings": [], - "errors": [], - "actions": [] - } + results = {"warnings": [], "errors": [], "actions": []} y = infer_feature_types(y) @@ -84,27 +89,46 @@ def validate(self, X, y): if len(below_threshold_folds): below_threshold_values = below_threshold_folds.index.tolist() error_msg = "The number of instances of these targets is less than 2 * the number of cross folds = {} instances: {}" - DataCheck._add_message(DataCheckError(message=error_msg.format(self.cv_folds, below_threshold_values), - data_check_name=self.name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, - details={"target_values": below_threshold_values}), results) + DataCheck._add_message( + DataCheckError( + message=error_msg.format(self.cv_folds, below_threshold_values), + data_check_name=self.name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, + details={"target_values": below_threshold_values}, + ), + results, + ) counts = fold_counts / (fold_counts + fold_counts.values[0]) below_threshold = counts.where(counts < self.threshold).dropna() # if there are items that occur less than the threshold, add them to the list of results if len(below_threshold): below_threshold_values = below_threshold.index.tolist() warning_msg = "The following labels fall below {:.0f}% of the target: {}" - DataCheck._add_message(DataCheckWarning(message=warning_msg.format(self.threshold * 100, below_threshold_values), - data_check_name=self.name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, - details={"target_values": below_threshold_values}), results) + DataCheck._add_message( + DataCheckWarning( + message=warning_msg.format( + self.threshold * 100, below_threshold_values + ), + data_check_name=self.name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, + details={"target_values": below_threshold_values}, + ), + results, + ) sample_counts = fold_counts.where(fold_counts < self.min_samples).dropna() if len(below_threshold) and len(sample_counts): sample_count_values = sample_counts.index.tolist() severe_imbalance = [v for v in sample_count_values if v in below_threshold] warning_msg = "The following labels in the target have severe class imbalance because they fall under {:.0f}% of the target and have less than {} samples: {}" - DataCheck._add_message(DataCheckWarning(message=warning_msg.format(self.threshold * 100, self.min_samples, severe_imbalance), - data_check_name=self.name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE, - details={"target_values": severe_imbalance}), results) + DataCheck._add_message( + DataCheckWarning( + message=warning_msg.format( + 
self.threshold * 100, self.min_samples, severe_imbalance + ), + data_check_name=self.name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE, + details={"target_values": severe_imbalance}, + ), + results, + ) return results diff --git a/evalml/data_checks/data_check_action.py b/evalml/data_checks/data_check_action.py index 0c46398ce0..b929c071e7 100644 --- a/evalml/data_checks/data_check_action.py +++ b/evalml/data_checks/data_check_action.py @@ -14,12 +14,8 @@ def __init__(self, action_code, metadata=None): def __eq__(self, other): """Checks for equality. Two DataCheckAction objs are considered equivalent if all of their attributes are equivalent.""" - return (self.action_code == other.action_code and - self.metadata == other.metadata) + return self.action_code == other.action_code and self.metadata == other.metadata def to_dict(self): - action_dict = { - "code": self.action_code.name, - "metadata": self.metadata - } + action_dict = {"code": self.action_code.name, "metadata": self.metadata} return action_dict diff --git a/evalml/data_checks/data_check_message.py b/evalml/data_checks/data_check_message.py index ec73ef3da8..1bbe11fa5e 100644 --- a/evalml/data_checks/data_check_message.py +++ b/evalml/data_checks/data_check_message.py @@ -27,11 +27,13 @@ def __str__(self): def __eq__(self, other): """Checks for equality. Two DataCheckMessage objs are considered equivalent if all of their attributes are equivalent.""" - return (self.message_type == other.message_type and - self.message == other.message and - self.data_check_name == other.data_check_name and - self.message_code == other.message_code and - self.details == other.details) + return ( + self.message_type == other.message_type + and self.message == other.message + and self.data_check_name == other.data_check_name + and self.message_code == other.message_code + and self.details == other.details + ) def to_dict(self): message_dict = { @@ -48,9 +50,11 @@ def to_dict(self): class DataCheckError(DataCheckMessage): """DataCheckMessage subclass for errors returned by data checks.""" + message_type = DataCheckMessageType.ERROR class DataCheckWarning(DataCheckMessage): """DataCheckMessage subclass for warnings returned by data checks.""" + message_type = DataCheckMessageType.WARNING diff --git a/evalml/data_checks/data_check_message_code.py b/evalml/data_checks/data_check_message_code.py index 4329cdf377..120b508db1 100644 --- a/evalml/data_checks/data_check_message_code.py +++ b/evalml/data_checks/data_check_message_code.py @@ -34,7 +34,9 @@ class DataCheckMessageCode(Enum): TARGET_BINARY_INVALID_VALUES = "target_binary_invalid_values" """Message code for target data for a binary classification problem with numerical values not equal to {0, 1}.""" - TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS = "target_multiclass_not_two_examples_per_class" + TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS = ( + "target_multiclass_not_two_examples_per_class" + ) """Message code for target data for a multi classification problem that does not have two examples per class.""" TARGET_MULTICLASS_NOT_ENOUGH_CLASSES = "target_multiclass_not_enough_classes" diff --git a/evalml/data_checks/data_check_message_type.py b/evalml/data_checks/data_check_message_type.py index aff39f3783..d88395273f 100644 --- a/evalml/data_checks/data_check_message_type.py +++ b/evalml/data_checks/data_check_message_type.py @@ -3,6 +3,7 @@ class DataCheckMessageType(Enum): """Enum for type of data check message: WARNING or ERROR.""" + WARNING = "warning" """Warning message returned 
by a data check.""" diff --git a/evalml/data_checks/data_checks.py b/evalml/data_checks/data_checks.py index 5a07bd48e1..ad73bae1fb 100644 --- a/evalml/data_checks/data_checks.py +++ b/evalml/data_checks/data_checks.py @@ -9,7 +9,9 @@ def _has_defaults_for_all_args(init): """Tests whether the init method has defaults for all arguments.""" signature = inspect.getfullargspec(init) n_default_args = 0 if not signature.defaults else len(signature.defaults) - n_args = len(signature.args) - 1 if 'self' in signature.args else len(signature.args) + n_args = ( + len(signature.args) - 1 if "self" in signature.args else len(signature.args) + ) return n_args == n_default_args @@ -20,11 +22,18 @@ class DataChecks: def _validate_data_checks(data_check_classes, params): """Inits a DataChecks instance from a list of DataCheck classes and corresponding params.""" if not isinstance(data_check_classes, list): - raise ValueError(f"Parameter data_checks must be a list. Received {type(data_check_classes).__name__}.") - if not all(inspect.isclass(check) and issubclass(check, DataCheck) for check in data_check_classes): - raise ValueError("All elements of parameter data_checks must be an instance of DataCheck " - "or a DataCheck class with any desired parameters specified in the " - "data_check_params dictionary.") + raise ValueError( + f"Parameter data_checks must be a list. Received {type(data_check_classes).__name__}." + ) + if not all( + inspect.isclass(check) and issubclass(check, DataCheck) + for check in data_check_classes + ): + raise ValueError( + "All elements of parameter data_checks must be an instance of DataCheck " + "or a DataCheck class with any desired parameters specified in the " + "data_check_params dictionary." + ) params = params or dict() if not isinstance(params, dict): raise ValueError(f"Params must be a dictionary. Received {params}") @@ -37,12 +46,14 @@ def _validate_data_checks(data_check_classes, params): raise DataCheckInitError( f"Class {extraneous_class} was provided in params dictionary but it does not match any name " "in the data_check_classes list. Make sure every key of the params dictionary matches the name" - "attribute of a corresponding DataCheck class.") + "attribute of a corresponding DataCheck class." + ) for missing_class_name in missing: if not _has_defaults_for_all_args(name_to_class[missing_class_name]): raise DataCheckInitError( f"Class {missing_class_name} was provided in the data_checks_classes list but it does not have " - "an entry in the parameters dictionary.") + "an entry in the parameters dictionary." + ) @staticmethod def _init_data_checks(data_check_classes, params): @@ -51,12 +62,14 @@ def _init_data_checks(data_check_classes, params): class_params = params.get(data_check_class.name, {}) if not isinstance(class_params, dict): raise DataCheckInitError( - f"Parameters for {data_check_class.name} were not in a dictionary. Received {class_params}.") + f"Parameters for {data_check_class.name} were not in a dictionary. Received {class_params}." 
+ ) try: data_check_instances.append(data_check_class(**class_params)) except TypeError as e: raise DataCheckInitError( - f"Encountered the following error while initializing {data_check_class.name}: {e}") + f"Encountered the following error while initializing {data_check_class.name}: {e}" + ) return data_check_instances def __init__(self, data_checks=None, data_check_params=None): @@ -83,13 +96,9 @@ def validate(self, X, y=None): dict: Dictionary containing DataCheckMessage objects """ - messages = { - "warnings": [], - "errors": [], - "actions": [] - } + messages = {"warnings": [], "errors": [], "actions": []} X = infer_feature_types(X) - X = X.ww.drop(list(X.ww.select('index').columns)) + X = X.ww.drop(list(X.ww.select("index").columns)) if y is not None: y = infer_feature_types(y) diff --git a/evalml/data_checks/datetime_nan_data_check.py b/evalml/data_checks/datetime_nan_data_check.py index 716871500f..90bbc2004b 100644 --- a/evalml/data_checks/datetime_nan_data_check.py +++ b/evalml/data_checks/datetime_nan_data_check.py @@ -8,8 +8,7 @@ class DateTimeNaNDataCheck(DataCheck): """Checks if datetime columns contain NaN values.""" def __init__(self): - """Checks each column in the input for datetime features and will issue an error if NaN values are present. - """ + """Checks each column in the input for datetime features and will issue an error if NaN values are present.""" def validate(self, X, y=None): """Checks if any datetime columns contain NaN values. @@ -37,20 +36,22 @@ def validate(self, X, y=None): ... message_code=DataCheckMessageCode.DATETIME_HAS_NAN, ... details={"columns": 'index'}).to_dict()]} """ - results = { - "warnings": [], - "errors": [], - "actions": [] - } + results = {"warnings": [], "errors": [], "actions": []} X = infer_feature_types(X) datetime_cols = X.ww.select("datetime") nan_columns = datetime_cols.columns[datetime_cols.isna().any()].tolist() if len(nan_columns) > 0: nan_columns = [str(col) for col in nan_columns] - cols_str = ', '.join(nan_columns) if len(nan_columns) > 1 else nan_columns[0] - results["errors"].append(DataCheckError(message=error_contains_nan.format(cols_str), - data_check_name=self.name, - message_code=DataCheckMessageCode.DATETIME_HAS_NAN, - details={"columns": cols_str}).to_dict()) + cols_str = ( + ", ".join(nan_columns) if len(nan_columns) > 1 else nan_columns[0] + ) + results["errors"].append( + DataCheckError( + message=error_contains_nan.format(cols_str), + data_check_name=self.name, + message_code=DataCheckMessageCode.DATETIME_HAS_NAN, + details={"columns": cols_str}, + ).to_dict() + ) return results diff --git a/evalml/data_checks/default_data_checks.py b/evalml/data_checks/default_data_checks.py index 95b13974fd..d98b116e8b 100644 --- a/evalml/data_checks/default_data_checks.py +++ b/evalml/data_checks/default_data_checks.py @@ -27,9 +27,15 @@ class DefaultDataChecks(DataChecks): """ - _DEFAULT_DATA_CHECK_CLASSES = [HighlyNullDataCheck, IDColumnsDataCheck, - TargetLeakageDataCheck, InvalidTargetDataCheck, NoVarianceDataCheck, - NaturalLanguageNaNDataCheck, DateTimeNaNDataCheck] + _DEFAULT_DATA_CHECK_CLASSES = [ + HighlyNullDataCheck, + IDColumnsDataCheck, + TargetLeakageDataCheck, + InvalidTargetDataCheck, + NoVarianceDataCheck, + NaturalLanguageNaNDataCheck, + DateTimeNaNDataCheck, + ] def __init__(self, problem_type, objective, n_splits=3): """ @@ -40,12 +46,27 @@ def __init__(self, problem_type, objective, n_splits=3): objective (str or ObjectiveBase): Name or instance of the objective class. 
n_splits (int): The number of splits as determined by the data splitter being used. """ - if handle_problem_types(problem_type) in [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]: - super().__init__(self._DEFAULT_DATA_CHECK_CLASSES, - data_check_params={"InvalidTargetDataCheck": {"problem_type": problem_type, - "objective": objective}}) + if handle_problem_types(problem_type) in [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ]: + super().__init__( + self._DEFAULT_DATA_CHECK_CLASSES, + data_check_params={ + "InvalidTargetDataCheck": { + "problem_type": problem_type, + "objective": objective, + } + }, + ) else: - super().__init__(self._DEFAULT_DATA_CHECK_CLASSES + [ClassImbalanceDataCheck], - data_check_params={"InvalidTargetDataCheck": {"problem_type": problem_type, - "objective": objective}, - "ClassImbalanceDataCheck": {"num_cv_folds": n_splits}}) + super().__init__( + self._DEFAULT_DATA_CHECK_CLASSES + [ClassImbalanceDataCheck], + data_check_params={ + "InvalidTargetDataCheck": { + "problem_type": problem_type, + "objective": objective, + }, + "ClassImbalanceDataCheck": {"num_cv_folds": n_splits}, + }, + ) diff --git a/evalml/data_checks/highly_null_data_check.py b/evalml/data_checks/highly_null_data_check.py index e32a96acc7..3ef5abc1bf 100644 --- a/evalml/data_checks/highly_null_data_check.py +++ b/evalml/data_checks/highly_null_data_check.py @@ -3,7 +3,7 @@ DataCheckAction, DataCheckActionCode, DataCheckMessageCode, - DataCheckWarning + DataCheckWarning, ) from evalml.utils import infer_feature_types @@ -20,7 +20,9 @@ def __init__(self, pct_null_threshold=0.95): """ if pct_null_threshold < 0 or pct_null_threshold > 1: - raise ValueError("pct_null_threshold must be a float between 0 and 1, inclusive.") + raise ValueError( + "pct_null_threshold must be a float between 0 and 1, inclusive." 
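# A sketch of the branch above: classification problem types get ClassImbalanceDataCheck
# appended to the default list, while regression problem types do not. The objective names
# and the .data_checks attribute are assumptions used only for illustration.
from evalml.data_checks import DefaultDataChecks

clf_suite = DefaultDataChecks(problem_type="binary", objective="Log Loss Binary", n_splits=3)
reg_suite = DefaultDataChecks(problem_type="regression", objective="R2")

clf_names = [check.name for check in clf_suite.data_checks]
# clf_names includes "ClassImbalanceDataCheck"; the regression suite's list does not.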
+ ) self.pct_null_threshold = pct_null_threshold def validate(self, X, y=None): @@ -66,34 +68,58 @@ def validate(self, X, y=None): "metadata": {"column": "lots_of_null"}}]} """ - results = { - "warnings": [], - "errors": [], - "actions": [] - } + results = {"warnings": [], "errors": [], "actions": []} X = infer_feature_types(X) percent_null_rows = X.isnull().mean(axis=1) - highly_null_rows = percent_null_rows[percent_null_rows >= self.pct_null_threshold] + highly_null_rows = percent_null_rows[ + percent_null_rows >= self.pct_null_threshold + ] if len(highly_null_rows) > 0: warning_msg = f"{len(highly_null_rows)} out of {len(X)} rows are more than {self.pct_null_threshold*100}% null" - results["warnings"].append(DataCheckWarning(message=warning_msg, - data_check_name=self.name, - message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS, - details={"pct_null_cols": highly_null_rows}).to_dict()) - results["actions"].append(DataCheckAction(DataCheckActionCode.DROP_ROWS, - metadata={"rows": highly_null_rows.index.tolist()}).to_dict()) + results["warnings"].append( + DataCheckWarning( + message=warning_msg, + data_check_name=self.name, + message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS, + details={"pct_null_cols": highly_null_rows}, + ).to_dict() + ) + results["actions"].append( + DataCheckAction( + DataCheckActionCode.DROP_ROWS, + metadata={"rows": highly_null_rows.index.tolist()}, + ).to_dict() + ) percent_null_cols = (X.isnull().mean()).to_dict() - highly_null_cols = {key: value for key, value in percent_null_cols.items() if value >= self.pct_null_threshold and value != 0} + highly_null_cols = { + key: value + for key, value in percent_null_cols.items() + if value >= self.pct_null_threshold and value != 0 + } warning_msg = "Column '{}' is {}% or more null" - results["warnings"].extend([DataCheckWarning(message=warning_msg.format(col_name, self.pct_null_threshold * 100), - data_check_name=self.name, - message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"column": col_name, "pct_null_rows": highly_null_cols[col_name]}).to_dict() - for col_name in highly_null_cols]) - results["actions"].extend([DataCheckAction(DataCheckActionCode.DROP_COL, - metadata={"column": col_name}).to_dict() - for col_name in highly_null_cols]) + results["warnings"].extend( + [ + DataCheckWarning( + message=warning_msg.format(col_name, self.pct_null_threshold * 100), + data_check_name=self.name, + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={ + "column": col_name, + "pct_null_rows": highly_null_cols[col_name], + }, + ).to_dict() + for col_name in highly_null_cols + ] + ) + results["actions"].extend( + [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": col_name} + ).to_dict() + for col_name in highly_null_cols + ] + ) return results diff --git a/evalml/data_checks/id_columns_data_check.py b/evalml/data_checks/id_columns_data_check.py index b235332d03..f3e4c59680 100644 --- a/evalml/data_checks/id_columns_data_check.py +++ b/evalml/data_checks/id_columns_data_check.py @@ -1,10 +1,9 @@ - from evalml.data_checks import ( DataCheck, DataCheckAction, DataCheckActionCode, DataCheckMessageCode, - DataCheckWarning + DataCheckWarning, ) from evalml.utils import infer_feature_types @@ -52,35 +51,60 @@ def validate(self, X, y=None): "actions": [{"code": "DROP_COL",\ "metadata": {"column": "df_id"}}]} """ - results = { - "warnings": [], - "errors": [], - "actions": [] - } + results = {"warnings": [], "errors": [], "actions": []} X = infer_feature_types(X) col_names = [col for col in 
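# A usage sketch for the highly-null check above: a column that is at least
# pct_null_threshold null is reported with a HIGHLY_NULL_COLS warning and a DROP_COL action.
import pandas as pd
from evalml.data_checks import HighlyNullDataCheck

X = pd.DataFrame({
    "lots_of_null": [None] * 19 + [1.0],  # 95% null -> flagged
    "no_null": range(20),                 # fully populated -> ignored
})
results = HighlyNullDataCheck(pct_null_threshold=0.95).validate(X)
# results["warnings"] names 'lots_of_null'; results["actions"] proposes dropping it.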
X.columns] - cols_named_id = [col for col in col_names if (str(col).lower() == "id")] # columns whose name is "id" + cols_named_id = [ + col for col in col_names if (str(col).lower() == "id") + ] # columns whose name is "id" id_cols = {col: 0.95 for col in cols_named_id} - X = X.ww.select(include=['Integer', 'Categorical']) - - check_all_unique = (X.nunique() == len(X)) - cols_with_all_unique = check_all_unique[check_all_unique].index.tolist() # columns whose values are all unique - id_cols.update([(col, 1.0) if col in id_cols else (col, 0.95) for col in cols_with_all_unique]) - - col_ends_with_id = [col for col in col_names if str(col).lower().endswith("_id")] # columns whose name ends with "_id" - id_cols.update([(col, 1.0) if str(col) in id_cols else (col, 0.95) for col in col_ends_with_id]) - - id_cols_above_threshold = {key: value for key, value in id_cols.items() if value >= self.id_threshold} + X = X.ww.select(include=["Integer", "Categorical"]) + + check_all_unique = X.nunique() == len(X) + cols_with_all_unique = check_all_unique[ + check_all_unique + ].index.tolist() # columns whose values are all unique + id_cols.update( + [ + (col, 1.0) if col in id_cols else (col, 0.95) + for col in cols_with_all_unique + ] + ) + + col_ends_with_id = [ + col for col in col_names if str(col).lower().endswith("_id") + ] # columns whose name ends with "_id" + id_cols.update( + [ + (col, 1.0) if str(col) in id_cols else (col, 0.95) + for col in col_ends_with_id + ] + ) + + id_cols_above_threshold = { + key: value for key, value in id_cols.items() if value >= self.id_threshold + } warning_msg = "Column '{}' is {}% or more likely to be an ID column" - results["warnings"].extend([DataCheckWarning(message=warning_msg.format(col_name, self.id_threshold * 100), - data_check_name=self.name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": col_name}).to_dict() - for col_name in id_cols_above_threshold]) - results["actions"].extend([DataCheckAction(DataCheckActionCode.DROP_COL, - metadata={"column": col_name}).to_dict() - for col_name in id_cols_above_threshold]) + results["warnings"].extend( + [ + DataCheckWarning( + message=warning_msg.format(col_name, self.id_threshold * 100), + data_check_name=self.name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": col_name}, + ).to_dict() + for col_name in id_cols_above_threshold + ] + ) + results["actions"].extend( + [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": col_name} + ).to_dict() + for col_name in id_cols_above_threshold + ] + ) return results diff --git a/evalml/data_checks/invalid_targets_data_check.py b/evalml/data_checks/invalid_targets_data_check.py index 826b963b4c..4964e880ae 100644 --- a/evalml/data_checks/invalid_targets_data_check.py +++ b/evalml/data_checks/invalid_targets_data_check.py @@ -6,7 +6,7 @@ DataCheckActionCode, DataCheckError, DataCheckMessageCode, - DataCheckWarning + DataCheckWarning, ) from evalml.objectives import get_objective from evalml.problem_types import ( @@ -14,18 +14,15 @@ handle_problem_types, is_binary, is_multiclass, - is_regression -) -from evalml.utils.woodwork_utils import ( - infer_feature_types, - numeric_and_boolean_ww + is_regression, ) +from evalml.utils.woodwork_utils import infer_feature_types, numeric_and_boolean_ww class InvalidTargetDataCheck(DataCheck): """Checks if the target data contains missing or invalid values.""" - multiclass_continuous_threshold = .05 + multiclass_continuous_threshold = 0.05 def __init__(self, problem_type, 
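# A sketch of the ID-column heuristics above: a column literally named "id", a column ending
# in "_id", and an all-unique integer column each accumulate a likelihood score, and any
# column at or above id_threshold gets a warning plus a DROP_COL action.
import pandas as pd
from evalml.data_checks import IDColumnsDataCheck

X = pd.DataFrame({
    "id": range(10),           # named "id" and all unique -> score 1.0
    "user_id": range(10, 20),  # ends with "_id" and all unique -> score 1.0
    "price": [5] * 10,         # constant, not ID-like
})
results = IDColumnsDataCheck(id_threshold=0.95).validate(X)
# results["warnings"] flags 'id' and 'user_id'; 'price' is left alone.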
objective, n_unique=100): """Check if the target is invalid for the specified problem type. @@ -66,44 +63,77 @@ def validate(self, X, y): "warnings": [],\ "actions": [{'code': 'IMPUTE_COL', 'metadata': {'column': None, 'impute_strategy': 'most_frequent', 'is_target': True}}]} """ - results = { - "warnings": [], - "errors": [], - "actions": [] - } + results = {"warnings": [], "errors": [], "actions": []} if y is None: - results["errors"].append(DataCheckError(message="Target is None", - data_check_name=self.name, - message_code=DataCheckMessageCode.TARGET_IS_NONE, - details={}).to_dict()) + results["errors"].append( + DataCheckError( + message="Target is None", + data_check_name=self.name, + message_code=DataCheckMessageCode.TARGET_IS_NONE, + details={}, + ).to_dict() + ) return results y = infer_feature_types(y) - is_supported_type = y.ww.logical_type in numeric_and_boolean_ww + [ww.logical_types.Categorical] + is_supported_type = y.ww.logical_type in numeric_and_boolean_ww + [ + ww.logical_types.Categorical + ] if not is_supported_type: - results["errors"].append(DataCheckError(message="Target is unsupported {} type. Valid Woodwork logical types include: {}" - .format(y.ww.logical_type, ", ".join([ltype.type_string for ltype in numeric_and_boolean_ww])), - data_check_name=self.name, - message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE, - details={"unsupported_type": y.ww.logical_type.type_string}).to_dict()) + results["errors"].append( + DataCheckError( + message="Target is unsupported {} type. Valid Woodwork logical types include: {}".format( + y.ww.logical_type, + ", ".join( + [ltype.type_string for ltype in numeric_and_boolean_ww] + ), + ), + data_check_name=self.name, + message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE, + details={"unsupported_type": y.ww.logical_type.type_string}, + ).to_dict() + ) null_rows = y.isnull() if null_rows.all(): - results["errors"].append(DataCheckError(message="Target is either empty or fully null.", - data_check_name=self.name, - message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL, - details={}).to_dict()) + results["errors"].append( + DataCheckError( + message="Target is either empty or fully null.", + data_check_name=self.name, + message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL, + details={}, + ).to_dict() + ) return results elif null_rows.any(): num_null_rows = null_rows.sum() pct_null_rows = null_rows.mean() * 100 - results["errors"].append(DataCheckError(message="{} row(s) ({}%) of target values are null".format(num_null_rows, pct_null_rows), - data_check_name=self.name, - message_code=DataCheckMessageCode.TARGET_HAS_NULL, - details={"num_null_rows": num_null_rows, "pct_null_rows": pct_null_rows}).to_dict()) - impute_strategy = "mean" if is_regression(self.problem_type) else "most_frequent" - results["actions"].append(DataCheckAction(DataCheckActionCode.IMPUTE_COL, - metadata={"column": None, "is_target": True, "impute_strategy": impute_strategy}).to_dict()) + results["errors"].append( + DataCheckError( + message="{} row(s) ({}%) of target values are null".format( + num_null_rows, pct_null_rows + ), + data_check_name=self.name, + message_code=DataCheckMessageCode.TARGET_HAS_NULL, + details={ + "num_null_rows": num_null_rows, + "pct_null_rows": pct_null_rows, + }, + ).to_dict() + ) + impute_strategy = ( + "mean" if is_regression(self.problem_type) else "most_frequent" + ) + results["actions"].append( + DataCheckAction( + DataCheckActionCode.IMPUTE_COL, + metadata={ + "column": None, + "is_target": True, + 
"impute_strategy": impute_strategy, + }, + ).to_dict() + ) value_counts = y.value_counts() unique_values = value_counts.index.tolist() @@ -112,50 +142,87 @@ def validate(self, X, y): if self.n_unique is None: details = {"target_values": unique_values} else: - details = {"target_values": unique_values[:min(self.n_unique, len(unique_values))]} - results["errors"].append(DataCheckError(message="Binary class targets require exactly two unique values.", - data_check_name=self.name, - message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES, - details=details).to_dict()) - - if self.problem_type == ProblemTypes.REGRESSION and "numeric" not in y.ww.semantic_tags: - results["errors"].append(DataCheckError(message="Target data type should be numeric for regression type problems.", - data_check_name=self.name, - message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE, - details={}).to_dict()) + details = { + "target_values": unique_values[ + : min(self.n_unique, len(unique_values)) + ] + } + results["errors"].append( + DataCheckError( + message="Binary class targets require exactly two unique values.", + data_check_name=self.name, + message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES, + details=details, + ).to_dict() + ) + + if ( + self.problem_type == ProblemTypes.REGRESSION + and "numeric" not in y.ww.semantic_tags + ): + results["errors"].append( + DataCheckError( + message="Target data type should be numeric for regression type problems.", + data_check_name=self.name, + message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE, + details={}, + ).to_dict() + ) if is_multiclass(self.problem_type): if value_counts.min() <= 1: least_populated = value_counts[value_counts <= 1] - details = {"least_populated_class_labels": least_populated.index.tolist()} - results["errors"].append(DataCheckError(message="Target does not have at least two instances per class which is required for multiclass classification", - data_check_name=self.name, - message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS, - details=details).to_dict()) + details = { + "least_populated_class_labels": least_populated.index.tolist() + } + results["errors"].append( + DataCheckError( + message="Target does not have at least two instances per class which is required for multiclass classification", + data_check_name=self.name, + message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS, + details=details, + ).to_dict() + ) if len(unique_values) <= 2: details = {"num_classes": len(unique_values)} - results["errors"].append(DataCheckError( - message="Target has two or less classes, which is too few for multiclass problems. Consider changing to binary.", - data_check_name=self.name, - message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES, - details=details).to_dict()) + results["errors"].append( + DataCheckError( + message="Target has two or less classes, which is too few for multiclass problems. 
Consider changing to binary.", + data_check_name=self.name, + message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES, + details=details, + ).to_dict() + ) num_class_to_num_value_ratio = len(unique_values) / len(y) if num_class_to_num_value_ratio >= self.multiclass_continuous_threshold: details = {"class_to_value_ratio": num_class_to_num_value_ratio} - results["warnings"].append(DataCheckWarning( - message="Target has a large number of unique values, could be regression type problem.", - data_check_name=self.name, - message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS, - details=details).to_dict()) + results["warnings"].append( + DataCheckWarning( + message="Target has a large number of unique values, could be regression type problem.", + data_check_name=self.name, + message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS, + details=details, + ).to_dict() + ) - any_neg = not (y > 0).all() if y.ww.logical_type in [ww.logical_types.Integer, ww.logical_types.Double] else None + any_neg = ( + not (y > 0).all() + if y.ww.logical_type in [ww.logical_types.Integer, ww.logical_types.Double] + else None + ) if any_neg and self.objective.positive_only: - details = {"Count of offending values": sum(val <= 0 for val in y.values.flatten())} - results["errors"].append(DataCheckError(message=f"Target has non-positive values which is not supported for {self.objective.name}", - data_check_name=self.name, - message_code=DataCheckMessageCode.TARGET_INCOMPATIBLE_OBJECTIVE, - details=details).to_dict()) + details = { + "Count of offending values": sum(val <= 0 for val in y.values.flatten()) + } + results["errors"].append( + DataCheckError( + message=f"Target has non-positive values which is not supported for {self.objective.name}", + data_check_name=self.name, + message_code=DataCheckMessageCode.TARGET_INCOMPATIBLE_OBJECTIVE, + details=details, + ).to_dict() + ) if X is not None: X = infer_feature_types(X) @@ -164,24 +231,41 @@ def validate(self, X, y): X_length = len(X_index) y_length = len(y_index) if X_length != y_length: - results["warnings"].append(DataCheckWarning(message="Input target and features have different lengths", - data_check_name=self.name, - message_code=DataCheckMessageCode.MISMATCHED_LENGTHS, - details={"features_length": X_length, "target_length": y_length}).to_dict()) + results["warnings"].append( + DataCheckWarning( + message="Input target and features have different lengths", + data_check_name=self.name, + message_code=DataCheckMessageCode.MISMATCHED_LENGTHS, + details={ + "features_length": X_length, + "target_length": y_length, + }, + ).to_dict() + ) if X_index != y_index: if set(X_index) == set(y_index): - results["warnings"].append(DataCheckWarning(message="Input target and features have mismatched indices order", - data_check_name=self.name, - message_code=DataCheckMessageCode.MISMATCHED_INDICES_ORDER, - details={}).to_dict()) + results["warnings"].append( + DataCheckWarning( + message="Input target and features have mismatched indices order", + data_check_name=self.name, + message_code=DataCheckMessageCode.MISMATCHED_INDICES_ORDER, + details={}, + ).to_dict() + ) else: index_diff_not_in_X = list(set(y_index) - set(X_index))[:10] index_diff_not_in_y = list(set(X_index) - set(y_index))[:10] - results["warnings"].append(DataCheckWarning(message="Input target and features have mismatched indices", - data_check_name=self.name, - message_code=DataCheckMessageCode.MISMATCHED_INDICES, - details={"indices_not_in_features": index_diff_not_in_X, 
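# A usage sketch for the target check above: a binary target containing missing values
# yields a TARGET_HAS_NULL error plus an IMPUTE_COL action ("most_frequent" because the
# problem type is not regression). The objective name here is just an example.
import pandas as pd
from evalml.data_checks import InvalidTargetDataCheck

X = pd.DataFrame({"feature": range(6)})
y = pd.Series([0, 1, None, 1, 0, 1])

check = InvalidTargetDataCheck(problem_type="binary", objective="Log Loss Binary")
results = check.validate(X, y)
# results["errors"] reports the null row; results["actions"] proposes imputing the target.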
- "indices_not_in_target": index_diff_not_in_y}).to_dict()) + results["warnings"].append( + DataCheckWarning( + message="Input target and features have mismatched indices", + data_check_name=self.name, + message_code=DataCheckMessageCode.MISMATCHED_INDICES, + details={ + "indices_not_in_features": index_diff_not_in_X, + "indices_not_in_target": index_diff_not_in_y, + }, + ).to_dict() + ) return results diff --git a/evalml/data_checks/multicollinearity_data_check.py b/evalml/data_checks/multicollinearity_data_check.py index a72043353a..f27a1a986f 100644 --- a/evalml/data_checks/multicollinearity_data_check.py +++ b/evalml/data_checks/multicollinearity_data_check.py @@ -1,9 +1,4 @@ - -from evalml.data_checks import ( - DataCheck, - DataCheckMessageCode, - DataCheckWarning -) +from evalml.data_checks import DataCheck, DataCheckMessageCode, DataCheckWarning from evalml.utils import infer_feature_types @@ -30,22 +25,29 @@ def validate(self, X, y=None): dict: dict with a DataCheckWarning if there are any potentially multicollinear columns. """ - results = { - "warnings": [], - "errors": [], - "actions": [] - } + results = {"warnings": [], "errors": [], "actions": []} X = infer_feature_types(X) mutual_info_df = X.ww.mutual_information() if mutual_info_df.empty: return results - above_threshold = mutual_info_df.loc[mutual_info_df['mutual_info'] >= self.threshold] - correlated_cols = [(col_1, col_2) for col_1, col_2 in zip(above_threshold['column_1'], above_threshold['column_2'])] + above_threshold = mutual_info_df.loc[ + mutual_info_df["mutual_info"] >= self.threshold + ] + correlated_cols = [ + (col_1, col_2) + for col_1, col_2 in zip( + above_threshold["column_1"], above_threshold["column_2"] + ) + ] if correlated_cols: warning_msg = "Columns are likely to be correlated: {}" - results["warnings"].append(DataCheckWarning(message=warning_msg.format(correlated_cols), - data_check_name=self.name, - message_code=DataCheckMessageCode.IS_MULTICOLLINEAR, - details={"columns": correlated_cols}).to_dict()) + results["warnings"].append( + DataCheckWarning( + message=warning_msg.format(correlated_cols), + data_check_name=self.name, + message_code=DataCheckMessageCode.IS_MULTICOLLINEAR, + details={"columns": correlated_cols}, + ).to_dict() + ) return results diff --git a/evalml/data_checks/natural_language_nan_data_check.py b/evalml/data_checks/natural_language_nan_data_check.py index 06f2105317..f477ba4b02 100644 --- a/evalml/data_checks/natural_language_nan_data_check.py +++ b/evalml/data_checks/natural_language_nan_data_check.py @@ -8,8 +8,7 @@ class NaturalLanguageNaNDataCheck(DataCheck): """Checks if natural language columns contain NaN values.""" def __init__(self): - """Checks each column in the input for natural language features and will issue an error if NaN values are present. - """ + """Checks each column in the input for natural language features and will issue an error if NaN values are present.""" def validate(self, X, y=None): """Checks if any natural language columns contain NaN values. @@ -40,20 +39,22 @@ def validate(self, X, y=None): ... details={"columns": 'A'}).to_dict()] ... 
} """ - results = { - "warnings": [], - "errors": [], - "actions": [] - } + results = {"warnings": [], "errors": [], "actions": []} X = infer_feature_types(X) - X = X.ww.select('natural_language') + X = X.ww.select("natural_language") X_describe = X.ww.describe_dict() - nan_columns = [str(col) for col in X_describe if X_describe[col]['nan_count'] > 0] + nan_columns = [ + str(col) for col in X_describe if X_describe[col]["nan_count"] > 0 + ] if len(nan_columns) > 0: - cols_str = ', '.join(nan_columns) - results["errors"].append(DataCheckError(message=error_contains_nan.format(cols_str), - data_check_name=self.name, - message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN, - details={"columns": cols_str}).to_dict()) + cols_str = ", ".join(nan_columns) + results["errors"].append( + DataCheckError( + message=error_contains_nan.format(cols_str), + data_check_name=self.name, + message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN, + details={"columns": cols_str}, + ).to_dict() + ) return results diff --git a/evalml/data_checks/no_variance_data_check.py b/evalml/data_checks/no_variance_data_check.py index 15b9291ee0..9965aebe45 100644 --- a/evalml/data_checks/no_variance_data_check.py +++ b/evalml/data_checks/no_variance_data_check.py @@ -4,7 +4,7 @@ DataCheckActionCode, DataCheckError, DataCheckMessageCode, - DataCheckWarning + DataCheckWarning, ) from evalml.utils import infer_feature_types from evalml.utils.logger import get_logger @@ -39,18 +39,22 @@ def _check_for_errors(self, column_name, count_unique, any_nulls): message = f"{column_name} has {int(count_unique)} unique value." if count_unique <= 1: - return DataCheckError(message=message.format(name=column_name), - data_check_name=self.name, - message_code=DataCheckMessageCode.NO_VARIANCE, - details={"column": column_name}) + return DataCheckError( + message=message.format(name=column_name), + data_check_name=self.name, + message_code=DataCheckMessageCode.NO_VARIANCE, + details={"column": column_name}, + ) elif count_unique == 2 and not self._dropnan and any_nulls: - return DataCheckWarning(message=f"{column_name} has two unique values including nulls. " - "Consider encoding the nulls for " - "this column to be useful for machine learning.", - data_check_name=self.name, - message_code=DataCheckMessageCode.NO_VARIANCE_WITH_NULL, - details={"column": column_name}) + return DataCheckWarning( + message=f"{column_name} has two unique values including nulls. " + "Consider encoding the nulls for " + "this column to be useful for machine learning.", + data_check_name=self.name, + message_code=DataCheckMessageCode.NO_VARIANCE_WITH_NULL, + details={"column": column_name}, + ) def validate(self, X, y): """Check if the target or any of the features have no variance (1 unique value). @@ -62,11 +66,7 @@ def validate(self, X, y): Returns: dict: dict of warnings/errors corresponding to features or target with no variance. 
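# A usage sketch for the natural-language NaN check above, following the doctest in this
# hunk: the column has to be typed as NaturalLanguage in Woodwork to be selected at all.
import pandas as pd
import woodwork  # noqa: F401  (registers the DataFrame.ww accessor)
from evalml.data_checks import NaturalLanguageNaNDataCheck

X = pd.DataFrame({"A": [None, "string_that_is_long_enough_for_natural_language"]})
X.ww.init(logical_types={"A": "NaturalLanguage"})

results = NaturalLanguageNaNDataCheck().validate(X)
# results["errors"][0]["details"]["columns"] == "A"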
""" - results = { - "warnings": [], - "errors": [], - "actions": [] - } + results = {"warnings": [], "errors": [], "actions": []} X = infer_feature_types(X) y = infer_feature_types(y) @@ -74,16 +74,23 @@ def validate(self, X, y): unique_counts = X.nunique(dropna=self._dropnan).to_dict() any_nulls = (X.isnull().any()).to_dict() for col_name in unique_counts: - message = self._check_for_errors(col_name, unique_counts[col_name], any_nulls[col_name]) + message = self._check_for_errors( + col_name, unique_counts[col_name], any_nulls[col_name] + ) if not message: continue DataCheck._add_message(message, results) - results["actions"].append(DataCheckAction(DataCheckActionCode.DROP_COL, - metadata={"column": col_name}).to_dict()) + results["actions"].append( + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": col_name} + ).to_dict() + ) y_name = getattr(y, "name") if not y_name: y_name = "Y" - target_message = self._check_for_errors(y_name, y.nunique(dropna=self._dropnan), y.isnull().any()) + target_message = self._check_for_errors( + y_name, y.nunique(dropna=self._dropnan), y.isnull().any() + ) if target_message: DataCheck._add_message(target_message, results) return results diff --git a/evalml/data_checks/outliers_data_check.py b/evalml/data_checks/outliers_data_check.py index 6515b6fca4..3e5b569c36 100644 --- a/evalml/data_checks/outliers_data_check.py +++ b/evalml/data_checks/outliers_data_check.py @@ -1,11 +1,7 @@ import numpy as np from scipy.stats import gamma -from evalml.data_checks import ( - DataCheck, - DataCheckMessageCode, - DataCheckWarning -) +from evalml.data_checks import DataCheck, DataCheckMessageCode, DataCheckWarning from evalml.utils import infer_feature_types @@ -41,14 +37,10 @@ def validate(self, X, y=None): "errors": [],\ "actions": []} """ - results = { - "warnings": [], - "errors": [], - "actions": [] - } + results = {"warnings": [], "errors": [], "actions": []} X = infer_feature_types(X) - X = X.ww.select('numeric') + X = X.ww.select("numeric") if len(X.columns) == 0: return results @@ -56,13 +48,21 @@ def validate(self, X, y=None): has_outliers = [] for col in X.columns: outlier_results = OutliersDataCheck._outlier_score(X[col], False) - if outlier_results is not None and outlier_results["score"] <= 0.9: # 0.9 is threshold indicating data needs improvement + if ( + outlier_results is not None and outlier_results["score"] <= 0.9 + ): # 0.9 is threshold indicating data needs improvement has_outliers.append(col) - warning_msg = "Column(s) {} are likely to have outlier data.".format(", ".join([f"'{col}'" for col in has_outliers])) - results["warnings"].append(DataCheckWarning(message=warning_msg, - data_check_name=self.name, - message_code=DataCheckMessageCode.HAS_OUTLIERS, - details={"columns": has_outliers}).to_dict()) + warning_msg = "Column(s) {} are likely to have outlier data.".format( + ", ".join([f"'{col}'" for col in has_outliers]) + ) + results["warnings"].append( + DataCheckWarning( + message=warning_msg, + data_check_name=self.name, + message_code=DataCheckMessageCode.HAS_OUTLIERS, + details={"columns": has_outliers}, + ).to_dict() + ) return results @staticmethod @@ -102,25 +102,25 @@ def _no_outlier_prob(num_records: int, pct_outliers: float) -> float: # model log_n = np.log(num_records) log_shape = ( - 25.8218734380722 + - -29.2320460088643 * log_n + - 14.8228030299864 * log_n ** 2 + - -4.08052512660036 * log_n ** 3 + - 0.641429075842177 * log_n ** 4 + - -0.0571252717322226 * log_n ** 5 + - 0.00268694343911156 * log_n ** 6 + - 
-5.19415149920567e-05 * log_n ** 7 + 25.8218734380722 + + -29.2320460088643 * log_n + + 14.8228030299864 * log_n ** 2 + + -4.08052512660036 * log_n ** 3 + + 0.641429075842177 * log_n ** 4 + + -0.0571252717322226 * log_n ** 5 + + 0.00268694343911156 * log_n ** 6 + + -5.19415149920567e-05 * log_n ** 7 ) shape_param = np.exp(log_shape) log_scale = ( - -19.8196822259052 + - 8.5359212447622 * log_n + - -8.80487628113388 * log_n ** 2 + - 2.27711870991327 * log_n ** 3 + - -0.344443407676357 * log_n ** 4 + - 0.029820831994345 * log_n ** 5 + - -0.00136611527293756 * log_n ** 6 + - 2.56727158170901e-05 * log_n ** 7 + -19.8196822259052 + + 8.5359212447622 * log_n + + -8.80487628113388 * log_n ** 2 + + 2.27711870991327 * log_n ** 3 + + -0.344443407676357 * log_n ** 4 + + 0.029820831994345 * log_n ** 5 + + -0.00136611527293756 * log_n ** 6 + + 2.56727158170901e-05 * log_n ** 7 ) scale_param = np.exp(log_scale) diff --git a/evalml/data_checks/sparsity_data_check.py b/evalml/data_checks/sparsity_data_check.py index 77aad7d145..9e68721917 100644 --- a/evalml/data_checks/sparsity_data_check.py +++ b/evalml/data_checks/sparsity_data_check.py @@ -3,7 +3,7 @@ DataCheckAction, DataCheckActionCode, DataCheckMessageCode, - DataCheckWarning + DataCheckWarning, ) from evalml.problem_types import handle_problem_types, is_multiclass from evalml.utils.woodwork_utils import infer_feature_types @@ -59,25 +59,35 @@ def validate(self, X, y=None): "actions": [{"code": "DROP_COL",\ "metadata": {"column": "sparse"}}]} """ - results = { - "warnings": [], - "errors": [], - "actions": [] - } + results = {"warnings": [], "errors": [], "actions": []} X = infer_feature_types(X) - res = X.apply(SparsityDataCheck.sparsity_score, count_threshold=self.unique_count_threshold) + res = X.apply( + SparsityDataCheck.sparsity_score, + count_threshold=self.unique_count_threshold, + ) too_sparse_cols = [col for col in res.index[res < self.threshold]] - results["warnings"].extend([DataCheckWarning(message=warning_too_unique.format(col_name, - self.problem_type), - data_check_name=self.name, - message_code=DataCheckMessageCode.TOO_SPARSE, - details={"column": col_name, "sparsity_score": res.loc[col_name]}).to_dict() - for col_name in too_sparse_cols]) - results["actions"].extend([DataCheckAction(action_code=DataCheckActionCode.DROP_COL, - metadata={"column": col_name}).to_dict() - for col_name in too_sparse_cols]) + results["warnings"].extend( + [ + DataCheckWarning( + message=warning_too_unique.format(col_name, self.problem_type), + data_check_name=self.name, + message_code=DataCheckMessageCode.TOO_SPARSE, + details={"column": col_name, "sparsity_score": res.loc[col_name]}, + ).to_dict() + for col_name in too_sparse_cols + ] + ) + results["actions"].extend( + [ + DataCheckAction( + action_code=DataCheckActionCode.DROP_COL, + metadata={"column": col_name}, + ).to_dict() + for col_name in too_sparse_cols + ] + ) return results @staticmethod diff --git a/evalml/data_checks/target_leakage_data_check.py b/evalml/data_checks/target_leakage_data_check.py index 36e94b56cc..4e6d89a6f2 100644 --- a/evalml/data_checks/target_leakage_data_check.py +++ b/evalml/data_checks/target_leakage_data_check.py @@ -5,12 +5,9 @@ DataCheckAction, DataCheckActionCode, DataCheckMessageCode, - DataCheckWarning -) -from evalml.utils.woodwork_utils import ( - infer_feature_types, - numeric_and_boolean_ww + DataCheckWarning, ) +from evalml.utils.woodwork_utils import infer_feature_types, numeric_and_boolean_ww class TargetLeakageDataCheck(DataCheck): @@ -29,8 +26,10 @@ def 
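# A usage sketch for the sparsity check above (multiclass only): a column whose values
# almost never repeat more than unique_count_threshold times gets a low sparsity score and
# is flagged once that score falls below the threshold.
import pandas as pd
from evalml.data_checks import SparsityDataCheck

X = pd.DataFrame({
    "sparse": range(10),  # every value occurs once -> score 0
    "dense": [1, 2] * 5,  # each value occurs five times -> score 1
})
check = SparsityDataCheck(problem_type="multiclass", threshold=0.5, unique_count_threshold=3)
results = check.validate(X)
# results["warnings"] flags 'sparse' and results["actions"] suggests dropping it.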
__init__(self, pct_corr_threshold=0.95, method="mutual"): """ if pct_corr_threshold < 0 or pct_corr_threshold > 1: - raise ValueError("pct_corr_threshold must be a float between 0 and 1, inclusive.") - if method not in ['mutual', 'pearson']: + raise ValueError( + "pct_corr_threshold must be a float between 0 and 1, inclusive." + ) + if method not in ["mutual", "pearson"]: raise ValueError(f"Method '{method}' not in ['mutual', 'pearson']") self.pct_corr_threshold = pct_corr_threshold self.method = method @@ -40,15 +39,24 @@ def _calculate_pearson(self, X, y): X_num = X.ww.select(include=numeric_and_boolean_ww) if y.ww.logical_type not in numeric_and_boolean_ww or len(X_num.columns) == 0: return highly_corr_cols - highly_corr_cols = [label for label, col in X_num.iteritems() if abs(y.corr(col)) >= self.pct_corr_threshold] + highly_corr_cols = [ + label + for label, col in X_num.iteritems() + if abs(y.corr(col)) >= self.pct_corr_threshold + ] return highly_corr_cols def _calculate_mutual_information(self, X, y): highly_corr_cols = [] for col in X.columns: - cols_to_compare = infer_feature_types(pd.DataFrame({col: X[col], str(col) + "y": y})) + cols_to_compare = infer_feature_types( + pd.DataFrame({col: X[col], str(col) + "y": y}) + ) mutual_info = cols_to_compare.ww.mutual_information() - if len(mutual_info) > 0 and mutual_info['mutual_info'].iloc[0] > self.pct_corr_threshold: + if ( + len(mutual_info) > 0 + and mutual_info["mutual_info"].iloc[0] > self.pct_corr_threshold + ): highly_corr_cols.append(col) return highly_corr_cols @@ -83,27 +91,34 @@ def validate(self, X, y): "actions": [{"code": "DROP_COL",\ "metadata": {"column": "leak"}}]} """ - results = { - "warnings": [], - "errors": [], - "actions": [] - } + results = {"warnings": [], "errors": [], "actions": []} X = infer_feature_types(X) y = infer_feature_types(y) - if self.method == 'pearson': + if self.method == "pearson": highly_corr_cols = self._calculate_pearson(X, y) else: highly_corr_cols = self._calculate_mutual_information(X, y) warning_msg = "Column '{}' is {}% or more correlated with the target" - results["warnings"].extend([DataCheckWarning(message=warning_msg.format(col_name, self.pct_corr_threshold * 100), - data_check_name=self.name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": col_name}).to_dict() - for col_name in highly_corr_cols]) - results["actions"].extend([DataCheckAction(DataCheckActionCode.DROP_COL, - metadata={"column": col_name}).to_dict() - for col_name in highly_corr_cols]) + results["warnings"].extend( + [ + DataCheckWarning( + message=warning_msg.format(col_name, self.pct_corr_threshold * 100), + data_check_name=self.name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": col_name}, + ).to_dict() + for col_name in highly_corr_cols + ] + ) + results["actions"].extend( + [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": col_name} + ).to_dict() + for col_name in highly_corr_cols + ] + ) return results diff --git a/evalml/data_checks/uniqueness_data_check.py b/evalml/data_checks/uniqueness_data_check.py index 538c7c837f..db20a796fd 100644 --- a/evalml/data_checks/uniqueness_data_check.py +++ b/evalml/data_checks/uniqueness_data_check.py @@ -3,16 +3,14 @@ DataCheckAction, DataCheckActionCode, DataCheckMessageCode, - DataCheckWarning -) -from evalml.problem_types import ( - handle_problem_types, - is_multiclass, - is_regression + DataCheckWarning, ) +from evalml.problem_types import handle_problem_types, is_multiclass, is_regression from 
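# A sketch of the leakage check above: with method="pearson", a numeric feature that is an
# exact linear function of the target correlates at or above pct_corr_threshold and is
# flagged with a DROP_COL action; the default method="mutual" works analogously.
import pandas as pd
from evalml.data_checks import TargetLeakageDataCheck

y = pd.Series([0, 1, 0, 1, 0, 1, 0, 1])
X = pd.DataFrame({
    "leak": y * 10,                     # perfectly correlated with the target
    "noise": [3, 1, 4, 1, 5, 9, 2, 6],  # essentially unrelated
})
results = TargetLeakageDataCheck(pct_corr_threshold=0.95, method="pearson").validate(X, y)
# results["warnings"] reports 'leak'; 'noise' is not flagged.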
evalml.utils.woodwork_utils import infer_feature_types -warning_not_unique_enough = "Input columns ({}) for {} problem type are not unique enough." +warning_not_unique_enough = ( + "Input columns ({}) for {} problem type are not unique enough." +) warning_too_unique = "Input columns ({}) for {} problem type are too unique." @@ -63,11 +61,7 @@ def validate(self, X, y=None): "actions": [{"code": "DROP_COL",\ "metadata": {"column": "regression_not_unique_enough"}}]} """ - results = { - "warnings": [], - "errors": [], - "actions": [] - } + results = {"warnings": [], "errors": [], "actions": []} X = infer_feature_types(X) @@ -75,26 +69,56 @@ def validate(self, X, y=None): if is_regression(self.problem_type): not_unique_enough_cols = list(res.index[res < self.threshold]) - results["warnings"].extend([DataCheckWarning(message=warning_not_unique_enough.format(col_name, - self.problem_type), - data_check_name=self.name, - message_code=DataCheckMessageCode.NOT_UNIQUE_ENOUGH, - details={"column": col_name, "uniqueness_score": res.loc[col_name]}).to_dict() - for col_name in not_unique_enough_cols]) - results["actions"].extend([DataCheckAction(action_code=DataCheckActionCode.DROP_COL, - metadata={"column": col_name}).to_dict() - for col_name in not_unique_enough_cols]) + results["warnings"].extend( + [ + DataCheckWarning( + message=warning_not_unique_enough.format( + col_name, self.problem_type + ), + data_check_name=self.name, + message_code=DataCheckMessageCode.NOT_UNIQUE_ENOUGH, + details={ + "column": col_name, + "uniqueness_score": res.loc[col_name], + }, + ).to_dict() + for col_name in not_unique_enough_cols + ] + ) + results["actions"].extend( + [ + DataCheckAction( + action_code=DataCheckActionCode.DROP_COL, + metadata={"column": col_name}, + ).to_dict() + for col_name in not_unique_enough_cols + ] + ) elif is_multiclass(self.problem_type): too_unique_cols = list(res.index[res > self.threshold]) - results["warnings"].extend([DataCheckWarning(message=warning_too_unique.format(col_name, - self.problem_type), - data_check_name=self.name, - message_code=DataCheckMessageCode.TOO_UNIQUE, - details={"column": col_name, "uniqueness_score": res.loc[col_name]}).to_dict() - for col_name in too_unique_cols]) - results["actions"].extend([DataCheckAction(action_code=DataCheckActionCode.DROP_COL, - metadata={"column": col_name}).to_dict() - for col_name in too_unique_cols]) + results["warnings"].extend( + [ + DataCheckWarning( + message=warning_too_unique.format(col_name, self.problem_type), + data_check_name=self.name, + message_code=DataCheckMessageCode.TOO_UNIQUE, + details={ + "column": col_name, + "uniqueness_score": res.loc[col_name], + }, + ).to_dict() + for col_name in too_unique_cols + ] + ) + results["actions"].extend( + [ + DataCheckAction( + action_code=DataCheckActionCode.DROP_COL, + metadata={"column": col_name}, + ).to_dict() + for col_name in too_unique_cols + ] + ) return results @staticmethod diff --git a/evalml/demos/churn.py b/evalml/demos/churn.py index 659d355460..6e3d18157a 100644 --- a/evalml/demos/churn.py +++ b/evalml/demos/churn.py @@ -18,8 +18,10 @@ def load_churn(n_rows=None, verbose=True): data_folder_path = os.path.join(currdir_path, "data") churn_data_path = os.path.join(data_folder_path, "churn.csv") - return load_data(path=churn_data_path, - index="customerID", - target="Churn", - n_rows=n_rows, - verbose=verbose) + return load_data( + path=churn_data_path, + index="customerID", + target="Churn", + n_rows=n_rows, + verbose=verbose, + ) diff --git a/evalml/demos/fraud.py 
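# A sketch of the uniqueness rules above: the same score is read in opposite directions,
# flagging low-uniqueness columns for regression and high-uniqueness columns for multiclass.
import pandas as pd
from evalml.data_checks import UniquenessDataCheck

X = pd.DataFrame({
    "repeated": [1] * 10,     # uniqueness score 0
    "all_unique": range(10),  # uniqueness score 0.9
})
reg = UniquenessDataCheck(problem_type="regression", threshold=0.5).validate(X)
# flags 'repeated' as NOT_UNIQUE_ENOUGH
multi = UniquenessDataCheck(problem_type="multiclass", threshold=0.5).validate(X)
# flags 'all_unique' as TOO_UNIQUE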
b/evalml/demos/fraud.py index f5a117f599..91406630e2 100644 --- a/evalml/demos/fraud.py +++ b/evalml/demos/fraud.py @@ -18,8 +18,6 @@ def load_fraud(n_rows=None, verbose=True): data_folder_path = os.path.join(currdir_path, "data") fraud_data_path = os.path.join(data_folder_path, "fraud_transactions.csv.gz") - return load_data(path=fraud_data_path, - index="id", - target="fraud", - n_rows=n_rows, - verbose=verbose) + return load_data( + path=fraud_data_path, index="id", target="fraud", n_rows=n_rows, verbose=verbose + ) diff --git a/evalml/exceptions/__init__.py b/evalml/exceptions/__init__.py index a887aef964..2a5d3fac72 100644 --- a/evalml/exceptions/__init__.py +++ b/evalml/exceptions/__init__.py @@ -1,4 +1,3 @@ - from .exceptions import ( MethodPropertyNotFoundError, PipelineNotFoundError, @@ -12,5 +11,5 @@ EnsembleMissingPipelinesError, NullsInColumnWarning, ObjectiveCreationError, - NoPositiveLabelException + NoPositiveLabelException, ) diff --git a/evalml/exceptions/exceptions.py b/evalml/exceptions/exceptions.py index 250c7f16af..12abef5e65 100644 --- a/evalml/exceptions/exceptions.py +++ b/evalml/exceptions/exceptions.py @@ -1,40 +1,48 @@ class MethodPropertyNotFoundError(Exception): """Exception to raise when a class is does not have an expected method or property.""" + pass class PipelineNotFoundError(Exception): """An exception raised when a particular pipeline is not found in automl search results""" + pass class ObjectiveNotFoundError(Exception): """Exception to raise when specified objective does not exist.""" + pass class MissingComponentError(Exception): """An exception raised when a component is not found in all_components()""" + pass class ComponentNotYetFittedError(Exception): """An exception to be raised when predict/predict_proba/transform is called on a component without fitting first.""" + pass class PipelineNotYetFittedError(Exception): """An exception to be raised when predict/predict_proba/transform is called on a pipeline without fitting first.""" + pass class AutoMLSearchException(Exception): """Exception raised when all pipelines in an automl batch return a score of NaN for the primary objective.""" + pass class EnsembleMissingPipelinesError(Exception): """An exception raised when an ensemble is missing `estimators` (list) as a parameter.""" + pass @@ -55,7 +63,9 @@ def __init__(self, exceptions, scored_successfully): # Format the traceback message exception_list = [] for objective, (exception, tb) in exceptions.items(): - exception_list.append(f"{objective} encountered {str(exception.__class__.__name__)} with message ({str(exception)}):\n") + exception_list.append( + f"{objective} encountered {str(exception.__class__.__name__)} with message ({str(exception)}):\n" + ) exception_list.extend(tb) message = "\n".join(exception_list) diff --git a/evalml/model_family/model_family.py b/evalml/model_family/model_family.py index 42ce7f848a..85f23d8a05 100644 --- a/evalml/model_family/model_family.py +++ b/evalml/model_family/model_family.py @@ -4,59 +4,61 @@ class ModelFamily(Enum): """Enum for family of machine learning models.""" - K_NEIGHBORS = 'k_neighbors' + K_NEIGHBORS = "k_neighbors" """K Nearest Neighbors model family.""" - RANDOM_FOREST = 'random_forest' + RANDOM_FOREST = "random_forest" """Random Forest model family.""" - SVM = 'svm' + SVM = "svm" """SVM model family.""" - XGBOOST = 'xgboost' + XGBOOST = "xgboost" """XGBoost model family.""" - LIGHTGBM = 'lightgbm' + LIGHTGBM = "lightgbm" """LightGBM model family.""" - LINEAR_MODEL = 'linear_model' + 
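# A quick usage sketch for the demo loaders above; each returns the feature matrix and
# target produced by load_data.
from evalml.demos import load_churn, load_fraud

X_fraud, y_fraud = load_fraud(n_rows=500, verbose=False)
X_churn, y_churn = load_churn(n_rows=500, verbose=False)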
LINEAR_MODEL = "linear_model" """Linear model family.""" - CATBOOST = 'catboost' + CATBOOST = "catboost" """CatBoost model family.""" - EXTRA_TREES = 'extra_trees' + EXTRA_TREES = "extra_trees" """Extra Trees model family.""" - ENSEMBLE = 'ensemble' + ENSEMBLE = "ensemble" """Ensemble model family.""" - DECISION_TREE = 'decision_tree' + DECISION_TREE = "decision_tree" """Decision Tree model family.""" - ARIMA = 'arima' + ARIMA = "arima" """ARIMA model family.""" - BASELINE = 'baseline' + BASELINE = "baseline" """Baseline model family.""" - NONE = 'none' + NONE = "none" """None""" def __str__(self): - model_family_dict = {ModelFamily.K_NEIGHBORS.name: "K Nearest Neighbors", - ModelFamily.RANDOM_FOREST.name: "Random Forest", - ModelFamily.SVM.name: "SVM", - ModelFamily.XGBOOST.name: "XGBoost", - ModelFamily.LIGHTGBM.name: "LightGBM", - ModelFamily.LINEAR_MODEL.name: "Linear", - ModelFamily.CATBOOST.name: "CatBoost", - ModelFamily.EXTRA_TREES.name: "Extra Trees", - ModelFamily.DECISION_TREE.name: "Decision Tree", - ModelFamily.BASELINE.name: "Baseline", - ModelFamily.ENSEMBLE.name: "Ensemble", - ModelFamily.ARIMA.name: "ARIMA", - ModelFamily.NONE.name: "None"} + model_family_dict = { + ModelFamily.K_NEIGHBORS.name: "K Nearest Neighbors", + ModelFamily.RANDOM_FOREST.name: "Random Forest", + ModelFamily.SVM.name: "SVM", + ModelFamily.XGBOOST.name: "XGBoost", + ModelFamily.LIGHTGBM.name: "LightGBM", + ModelFamily.LINEAR_MODEL.name: "Linear", + ModelFamily.CATBOOST.name: "CatBoost", + ModelFamily.EXTRA_TREES.name: "Extra Trees", + ModelFamily.DECISION_TREE.name: "Decision Tree", + ModelFamily.BASELINE.name: "Baseline", + ModelFamily.ENSEMBLE.name: "Ensemble", + ModelFamily.ARIMA.name: "ARIMA", + ModelFamily.NONE.name: "None", + } return model_family_dict[self.name] def __repr__(self): @@ -64,6 +66,12 @@ def __repr__(self): def is_tree_estimator(self): """Checks whether the estimator's model family uses trees.""" - tree_estimators = {self.CATBOOST, self.EXTRA_TREES, self.RANDOM_FOREST, - self.DECISION_TREE, self.XGBOOST, self.LIGHTGBM} + tree_estimators = { + self.CATBOOST, + self.EXTRA_TREES, + self.RANDOM_FOREST, + self.DECISION_TREE, + self.XGBOOST, + self.LIGHTGBM, + } return self in tree_estimators diff --git a/evalml/model_family/utils.py b/evalml/model_family/utils.py index de547ead7b..8e9a39813b 100644 --- a/evalml/model_family/utils.py +++ b/evalml/model_family/utils.py @@ -16,7 +16,7 @@ def handle_model_family(model_family): tpe = ModelFamily[model_family.upper()] return tpe except KeyError: - raise KeyError('Model family \'{}\' does not exist'.format(model_family)) + raise KeyError("Model family '{}' does not exist".format(model_family)) if isinstance(model_family, ModelFamily): return model_family - raise ValueError('`handle_model_family` was not passed a str or ModelFamily object') + raise ValueError("`handle_model_family` was not passed a str or ModelFamily object") diff --git a/evalml/model_understanding/__init__.py b/evalml/model_understanding/__init__.py index 587d406bc8..fb5bdc7238 100644 --- a/evalml/model_understanding/__init__.py +++ b/evalml/model_understanding/__init__.py @@ -17,13 +17,10 @@ partial_dependence, precision_recall_curve, roc_curve, - t_sne -) -from .prediction_explanations import ( - explain_predictions, - explain_predictions_best_worst + t_sne, ) +from .prediction_explanations import explain_predictions, explain_predictions_best_worst from .permutation_importance import ( calculate_permutation_importance, - calculate_permutation_importance_one_column + 
calculate_permutation_importance_one_column, ) diff --git a/evalml/model_understanding/graphs.py b/evalml/model_understanding/graphs.py index 3cb983d6eb..37bc4cc122 100644 --- a/evalml/model_understanding/graphs.py +++ b/evalml/model_understanding/graphs.py @@ -11,8 +11,7 @@ from sklearn.manifold import TSNE from sklearn.metrics import auc as sklearn_auc from sklearn.metrics import confusion_matrix as sklearn_confusion_matrix -from sklearn.metrics import \ - precision_recall_curve as sklearn_precision_recall_curve +from sklearn.metrics import precision_recall_curve as sklearn_precision_recall_curve from sklearn.metrics import roc_curve as sklearn_roc_curve from sklearn.preprocessing import LabelBinarizer from sklearn.tree import export_graphviz @@ -22,14 +21,14 @@ from evalml.exceptions import NoPositiveLabelException, NullsInColumnWarning from evalml.model_family import ModelFamily from evalml.model_understanding.permutation_importance import ( - calculate_permutation_importance + calculate_permutation_importance, ) from evalml.objectives.utils import get_objective from evalml.problem_types import ProblemTypes from evalml.utils import import_or_raise, infer_feature_types, jupyter_check -def confusion_matrix(y_true, y_predicted, normalize_method='true'): +def confusion_matrix(y_true, y_predicted, normalize_method="true"): """Confusion matrix for binary and multiclass classification. Arguments: @@ -52,7 +51,7 @@ def confusion_matrix(y_true, y_predicted, normalize_method='true'): return conf_mat -def normalize_confusion_matrix(conf_mat, normalize_method='true'): +def normalize_confusion_matrix(conf_mat, normalize_method="true"): """Normalizes a confusion matrix. Arguments: @@ -67,21 +66,29 @@ def normalize_confusion_matrix(conf_mat, normalize_method='true'): conf_mat = conf_mat.to_numpy() with warnings.catch_warnings(record=True) as w: - if normalize_method == 'true': - conf_mat = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis] - elif normalize_method == 'pred': - conf_mat = conf_mat.astype('float') / conf_mat.sum(axis=0) - elif normalize_method == 'all': - conf_mat = conf_mat.astype('float') / conf_mat.sum().sum() + if normalize_method == "true": + conf_mat = conf_mat.astype("float") / conf_mat.sum(axis=1)[:, np.newaxis] + elif normalize_method == "pred": + conf_mat = conf_mat.astype("float") / conf_mat.sum(axis=0) + elif normalize_method == "all": + conf_mat = conf_mat.astype("float") / conf_mat.sum().sum() else: - raise ValueError('Invalid value provided for "normalize_method": {}'.format(normalize_method)) + raise ValueError( + 'Invalid value provided for "normalize_method": {}'.format( + normalize_method + ) + ) if w and "invalid value encountered in" in str(w[0].message): - raise ValueError("Sum of given axis is 0 and normalization is not possible. Please select another option.") + raise ValueError( + "Sum of given axis is 0 and normalization is not possible. Please select another option." + ) conf_mat = pd.DataFrame(conf_mat, index=col_names, columns=col_names) return conf_mat -def graph_confusion_matrix(y_true, y_pred, normalize_method='true', title_addition=None): +def graph_confusion_matrix( + y_true, y_pred, normalize_method="true", title_addition=None +): """Generate and display a confusion matrix plot. If `normalize_method` is set, hover text will show raw count, otherwise hover text will show count normalized with method 'true'. 
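# A small sketch of the helpers above: normalize_method "true" normalizes each row (actual
# class), "pred" each column, "all" the whole table, and None keeps raw counts.
from evalml.model_understanding.graphs import confusion_matrix

y_true = [0, 0, 1, 1, 1]
y_pred = [0, 1, 1, 1, 0]

raw_counts = confusion_matrix(y_true, y_pred, normalize_method=None)
per_row = confusion_matrix(y_true, y_pred, normalize_method="true")
# each row of per_row sums to 1.0; an unrecognized normalize_method raises ValueError.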
@@ -95,37 +102,67 @@ def graph_confusion_matrix(y_true, y_pred, normalize_method='true', title_additi Returns: plotly.Figure representing the confusion matrix plot generated """ - _go = import_or_raise("plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects") - _ff = import_or_raise("plotly.figure_factory", error_msg="Cannot find dependency plotly.figure_factory") + _go = import_or_raise( + "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" + ) + _ff = import_or_raise( + "plotly.figure_factory", + error_msg="Cannot find dependency plotly.figure_factory", + ) if jupyter_check(): import_or_raise("ipywidgets", warning=True) conf_mat = confusion_matrix(y_true, y_pred, normalize_method=None) - conf_mat_normalized = confusion_matrix(y_true, y_pred, normalize_method=normalize_method or 'true') + conf_mat_normalized = confusion_matrix( + y_true, y_pred, normalize_method=normalize_method or "true" + ) labels = conf_mat.columns.tolist() - title = 'Confusion matrix{}{}'.format( - '' if title_addition is None else (' ' + title_addition), - '' if normalize_method is None else (', normalized using method "' + normalize_method + '"')) - z_data, custom_data = (conf_mat, conf_mat_normalized) if normalize_method is None else (conf_mat_normalized, conf_mat) + title = "Confusion matrix{}{}".format( + "" if title_addition is None else (" " + title_addition), + "" + if normalize_method is None + else (', normalized using method "' + normalize_method + '"'), + ) + z_data, custom_data = ( + (conf_mat, conf_mat_normalized) + if normalize_method is None + else (conf_mat_normalized, conf_mat) + ) z_data = z_data.to_numpy() z_text = [["{:.3f}".format(y) for y in x] for x in z_data] - primary_heading, secondary_heading = ('Raw', 'Normalized') if normalize_method is None else ('Normalized', 'Raw') - hover_text = '
' + primary_heading + ' Count: %{z}
' + secondary_heading + ' Count: %{customdata}
' + primary_heading, secondary_heading = ( + ("Raw", "Normalized") if normalize_method is None else ("Normalized", "Raw") + ) + hover_text = ( + "
" + + primary_heading + + " Count: %{z}
" + + secondary_heading + + " Count: %{customdata}
" + ) # the " tags at the end are necessary to remove unwanted trace info - hover_template = 'True: %{y}
Predicted: %{x}' + hover_text + '' - layout = _go.Layout(title={'text': title}, - xaxis={'title': 'Predicted Label', 'type': 'category', 'tickvals': labels}, - yaxis={'title': 'True Label', 'type': 'category', 'tickvals': labels}) - fig = _ff.create_annotated_heatmap(z_data, x=labels, y=labels, - annotation_text=z_text, - customdata=custom_data, - hovertemplate=hover_template, - colorscale='Blues', - showscale=True) + hover_template = ( + "True: %{y}
Predicted: %{x}" + hover_text + "" + ) + layout = _go.Layout( + title={"text": title}, + xaxis={"title": "Predicted Label", "type": "category", "tickvals": labels}, + yaxis={"title": "True Label", "type": "category", "tickvals": labels}, + ) + fig = _ff.create_annotated_heatmap( + z_data, + x=labels, + y=labels, + annotation_text=z_text, + customdata=custom_data, + hovertemplate=hover_template, + colorscale="Blues", + showscale=True, + ) fig.update_layout(layout) # put xaxis text on bottom to not overlap with title - fig['layout']['xaxis'].update(side='bottom') + fig["layout"]["xaxis"].update(side="bottom") # plotly Heatmap y axis defaults to the reverse of what we want: https://community.plotly.com/t/heatmap-y-axis-is-reversed-by-default-going-against-standard-convention-for-matrices/32180 fig.update_yaxes(autorange="reversed") return fig @@ -156,14 +193,18 @@ def precision_recall_curve(y_true, y_pred_proba, pos_label_idx=-1): try: y_pred_proba = y_pred_proba.iloc[:, pos_label_idx] except IndexError: - raise NoPositiveLabelException(f"Predicted probabilities of shape {y_pred_proba_shape} don't contain a column at index {pos_label_idx}") + raise NoPositiveLabelException( + f"Predicted probabilities of shape {y_pred_proba_shape} don't contain a column at index {pos_label_idx}" + ) precision, recall, thresholds = sklearn_precision_recall_curve(y_true, y_pred_proba) auc_score = sklearn_auc(recall, precision) - return {'precision': precision, - 'recall': recall, - 'thresholds': thresholds, - 'auc_score': auc_score} + return { + "precision": precision, + "recall": recall, + "thresholds": thresholds, + "auc_score": auc_score, + } def graph_precision_recall_curve(y_true, y_pred_proba, title_addition=None): @@ -177,18 +218,31 @@ def graph_precision_recall_curve(y_true, y_pred_proba, title_addition=None): Returns: plotly.Figure representing the precision-recall plot generated """ - _go = import_or_raise("plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects") + _go = import_or_raise( + "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" + ) if jupyter_check(): import_or_raise("ipywidgets", warning=True) precision_recall_curve_data = precision_recall_curve(y_true, y_pred_proba) - title = 'Precision-Recall{}'.format('' if title_addition is None else (' ' + title_addition)) - layout = _go.Layout(title={'text': title}, - xaxis={'title': 'Recall', 'range': [-0.05, 1.05]}, - yaxis={'title': 'Precision', 'range': [-0.05, 1.05]}) + title = "Precision-Recall{}".format( + "" if title_addition is None else (" " + title_addition) + ) + layout = _go.Layout( + title={"text": title}, + xaxis={"title": "Recall", "range": [-0.05, 1.05]}, + yaxis={"title": "Precision", "range": [-0.05, 1.05]}, + ) data = [] - data.append(_go.Scatter(x=precision_recall_curve_data['recall'], y=precision_recall_curve_data['precision'], - name='Precision-Recall (AUC {:06f})'.format(precision_recall_curve_data['auc_score']), - line=dict(width=3))) + data.append( + _go.Scatter( + x=precision_recall_curve_data["recall"], + y=precision_recall_curve_data["precision"], + name="Precision-Recall (AUC {:06f})".format( + precision_recall_curve_data["auc_score"] + ), + line=dict(width=3), + ) + ) return _go.Figure(layout=layout, data=data) @@ -226,12 +280,18 @@ def roc_curve(y_true, y_pred_proba): curve_data = [] for i in range(n_classes): - fpr_rates, tpr_rates, thresholds = sklearn_roc_curve(y_one_hot_true[:, i], y_pred_proba[:, i]) + fpr_rates, tpr_rates, thresholds = sklearn_roc_curve( + 
y_one_hot_true[:, i], y_pred_proba[:, i] + ) auc_score = sklearn_auc(fpr_rates, tpr_rates) - curve_data.append({'fpr_rates': fpr_rates, - 'tpr_rates': tpr_rates, - 'thresholds': thresholds, - 'auc_score': auc_score}) + curve_data.append( + { + "fpr_rates": fpr_rates, + "tpr_rates": tpr_rates, + "thresholds": thresholds, + "auc_score": auc_score, + } + ) return curve_data @@ -248,14 +308,20 @@ def graph_roc_curve(y_true, y_pred_proba, custom_class_names=None, title_additio Returns: plotly.Figure representing the ROC plot generated """ - _go = import_or_raise("plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects") + _go = import_or_raise( + "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" + ) if jupyter_check(): import_or_raise("ipywidgets", warning=True) - title = 'Receiver Operating Characteristic{}'.format('' if title_addition is None else (' ' + title_addition)) - layout = _go.Layout(title={'text': title}, - xaxis={'title': 'False Positive Rate', 'range': [-0.05, 1.05]}, - yaxis={'title': 'True Positive Rate', 'range': [-0.05, 1.05]}) + title = "Receiver Operating Characteristic{}".format( + "" if title_addition is None else (" " + title_addition) + ) + layout = _go.Layout( + title={"text": title}, + xaxis={"title": "False Positive Rate", "range": [-0.05, 1.05]}, + yaxis={"title": "True Positive Rate", "range": [-0.05, 1.05]}, + ) all_curve_data = roc_curve(y_true, y_pred_proba) graph_data = [] @@ -263,19 +329,29 @@ def graph_roc_curve(y_true, y_pred_proba, custom_class_names=None, title_additio n_classes = len(all_curve_data) if custom_class_names and len(custom_class_names) != n_classes: - raise ValueError('Number of custom class names does not match number of classes') + raise ValueError( + "Number of custom class names does not match number of classes" + ) for i in range(n_classes): roc_curve_data = all_curve_data[i] name = i + 1 if custom_class_names is None else custom_class_names[i] - graph_data.append(_go.Scatter(x=roc_curve_data['fpr_rates'], y=roc_curve_data['tpr_rates'], - hovertemplate="(False Postive Rate: %{x}, True Positive Rate: %{y})
" + "Threshold: %{text}", - name=f"Class {name} (AUC {roc_curve_data['auc_score']:.06f})", - text=roc_curve_data["thresholds"], - line=dict(width=3))) - graph_data.append(_go.Scatter(x=[0, 1], y=[0, 1], - name='Trivial Model (AUC 0.5)', - line=dict(dash='dash'))) + graph_data.append( + _go.Scatter( + x=roc_curve_data["fpr_rates"], + y=roc_curve_data["tpr_rates"], + hovertemplate="(False Postive Rate: %{x}, True Positive Rate: %{y})
" + + "Threshold: %{text}", + name=f"Class {name} (AUC {roc_curve_data['auc_score']:.06f})", + text=roc_curve_data["thresholds"], + line=dict(width=3), + ) + ) + graph_data.append( + _go.Scatter( + x=[0, 1], y=[0, 1], name="Trivial Model (AUC 0.5)", line=dict(dash="dash") + ) + ) return _go.Figure(layout=layout, data=graph_data) @@ -292,37 +368,46 @@ def graph_permutation_importance(pipeline, X, y, objective, importance_threshold Returns: plotly.Figure, a bar graph showing features and their respective permutation importance. """ - go = import_or_raise("plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects") + go = import_or_raise( + "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" + ) if jupyter_check(): import_or_raise("ipywidgets", warning=True) perm_importance = calculate_permutation_importance(pipeline, X, y, objective) - perm_importance['importance'] = perm_importance['importance'] + perm_importance["importance"] = perm_importance["importance"] if importance_threshold < 0: - raise ValueError(f'Provided importance threshold of {importance_threshold} must be greater than or equal to 0') + raise ValueError( + f"Provided importance threshold of {importance_threshold} must be greater than or equal to 0" + ) # Remove features with close to zero importance - perm_importance = perm_importance[abs(perm_importance['importance']) >= importance_threshold] + perm_importance = perm_importance[ + abs(perm_importance["importance"]) >= importance_threshold + ] # List is reversed to go from ascending order to descending order perm_importance = perm_importance.iloc[::-1] title = "Permutation Importance" - subtitle = "The relative importance of each input feature's "\ - "overall influence on the pipelines' predictions, computed using "\ - "the permutation importance algorithm." - data = [go.Bar(x=perm_importance['importance'], - y=perm_importance['feature'], - orientation='h' - )] + subtitle = ( + "The relative importance of each input feature's " + "overall influence on the pipelines' predictions, computed using " + "the permutation importance algorithm." + ) + data = [ + go.Bar( + x=perm_importance["importance"], + y=perm_importance["feature"], + orientation="h", + ) + ] layout = { - 'title': '{0}
{1}'.format(title, subtitle), - 'height': 800, - 'xaxis_title': 'Permutation Importance', - 'yaxis_title': 'Feature', - 'yaxis': { - 'type': 'category' - } + "title": "{0}
{1}".format(title, subtitle), + "height": 800, + "xaxis_title": "Permutation Importance", + "yaxis_title": "Feature", + "yaxis": {"type": "category"}, } fig = go.Figure(data=data, layout=layout) @@ -346,7 +431,9 @@ def binary_objective_vs_threshold(pipeline, X, y, objective, steps=100): """ objective = get_objective(objective, return_instance=True) if not objective.is_defined_for_problem_type(ProblemTypes.BINARY): - raise ValueError("`binary_objective_vs_threshold` can only be calculated for binary classification objectives") + raise ValueError( + "`binary_objective_vs_threshold` can only be calculated for binary classification objectives" + ) if objective.score_needs_proba: raise ValueError("Objective `score_needs_proba` must be False") @@ -375,20 +462,25 @@ def graph_binary_objective_vs_threshold(pipeline, X, y, objective, steps=100): plotly.Figure representing the objective score vs. threshold graph generated """ - _go = import_or_raise("plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects") + _go = import_or_raise( + "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" + ) if jupyter_check(): import_or_raise("ipywidgets", warning=True) objective = get_objective(objective, return_instance=True) df = binary_objective_vs_threshold(pipeline, X, y, objective, steps) - title = f'{objective.name} Scores vs. Thresholds' - layout = _go.Layout(title={'text': title}, - xaxis={'title': 'Threshold', 'range': _calculate_axis_range(df['threshold'])}, - yaxis={'title': f"{objective.name} Scores vs. Binary Classification Decision Threshold", 'range': _calculate_axis_range(df['score'])}) + title = f"{objective.name} Scores vs. Thresholds" + layout = _go.Layout( + title={"text": title}, + xaxis={"title": "Threshold", "range": _calculate_axis_range(df["threshold"])}, + yaxis={ + "title": f"{objective.name} Scores vs. Binary Classification Decision Threshold", + "range": _calculate_axis_range(df["score"]), + }, + ) data = [] - data.append(_go.Scatter(x=df['threshold'], - y=df['score'], - line=dict(width=3))) + data.append(_go.Scatter(x=df["threshold"], y=df["score"], line=dict(width=3))) return _go.Figure(layout=layout, data=data) @@ -432,8 +524,10 @@ def _raise_value_error_if_any_features_all_nan(df): all_nan = [f"'{name}'" for name in all_nan] if all_nan: - raise ValueError("The following features have all NaN values and so the " - f"partial dependence cannot be computed: {', '.join(all_nan)}") + raise ValueError( + "The following features have all NaN values and so the " + f"partial dependence cannot be computed: {', '.join(all_nan)}" + ) def _raise_value_error_if_mostly_one_value(df, percentile): @@ -449,11 +543,15 @@ def _raise_value_error_if_mostly_one_value(df, percentile): values.append(str(normalized_counts.index[0])) if one_value: - raise ValueError(f"Features ({', '.join(one_value)}) are mostly one value, ({', '.join(values)}), " - f"and cannot be used to compute partial dependence. Try raising the upper percentage value.") + raise ValueError( + f"Features ({', '.join(one_value)}) are mostly one value, ({', '.join(values)}), " + f"and cannot be used to compute partial dependence. Try raising the upper percentage value." + ) -def partial_dependence(pipeline, X, features, percentiles=(0.05, 0.95), grid_resolution=100): +def partial_dependence( + pipeline, X, features, percentiles=(0.05, 0.95), grid_resolution=100 +): """Calculates one or two-way partial dependence. 
If a single integer or string is given for features, one-way partial dependence is calculated. If a tuple of two integers or strings is given, two-way partial dependence @@ -503,21 +601,40 @@ def partial_dependence(pipeline, X, features, percentiles=(0.05, 0.95), grid_res X = infer_feature_types(X) if isinstance(features, (list, tuple)): - is_categorical = [_is_feature_of_type(f, X, ww.logical_types.Categorical) for f in features] - is_datetime = [_is_feature_of_type(f, X, ww.logical_types.Datetime) for f in features] + is_categorical = [ + _is_feature_of_type(f, X, ww.logical_types.Categorical) for f in features + ] + is_datetime = [ + _is_feature_of_type(f, X, ww.logical_types.Datetime) for f in features + ] else: - is_categorical = [_is_feature_of_type(features, X, ww.logical_types.Categorical)] + is_categorical = [ + _is_feature_of_type(features, X, ww.logical_types.Categorical) + ] is_datetime = [_is_feature_of_type(features, X, ww.logical_types.Datetime)] if isinstance(features, (list, tuple)): if len(features) != 2: - raise ValueError("Too many features given to graph_partial_dependence. Only one or two-way partial " - "dependence is supported.") - if not (all([isinstance(x, str) for x in features]) or all([isinstance(x, int) for x in features])): - raise ValueError("Features provided must be a tuple entirely of integers or strings, not a mixture of both.") - X_features = X.ww.iloc[:, list(features)] if isinstance(features[0], int) else X.ww[list(features)] + raise ValueError( + "Too many features given to graph_partial_dependence. Only one or two-way partial " + "dependence is supported." + ) + if not ( + all([isinstance(x, str) for x in features]) + or all([isinstance(x, int) for x in features]) + ): + raise ValueError( + "Features provided must be a tuple entirely of integers or strings, not a mixture of both." + ) + X_features = ( + X.ww.iloc[:, list(features)] + if isinstance(features[0], int) + else X.ww[list(features)] + ) else: - X_features = X.ww.iloc[:, [features]] if isinstance(features, int) else X.ww[[features]] + X_features = ( + X.ww.iloc[:, [features]] if isinstance(features, int) else X.ww[[features]] + ) X_cats = X_features.ww.select("categorical") if any(is_categorical): @@ -532,7 +649,9 @@ def partial_dependence(pipeline, X, features, percentiles=(0.05, 0.95), grid_res if isinstance(features, (list, tuple)): feature_names = _get_feature_names_from_str_or_col_index(X, features) if any(is_datetime): - raise ValueError('Two-way partial dependence is not supported for datetime columns.') + raise ValueError( + "Two-way partial dependence is not supported for datetime columns." + ) if any(is_categorical): features = _put_categorical_feature_first(features, is_categorical[0]) else: @@ -541,19 +660,30 @@ def partial_dependence(pipeline, X, features, percentiles=(0.05, 0.95), grid_res if not pipeline._is_fitted: raise ValueError("Pipeline to calculate partial dependence for must be fitted") if pipeline.model_family == ModelFamily.BASELINE: - raise ValueError("Partial dependence plots are not supported for Baseline pipelines") + raise ValueError( + "Partial dependence plots are not supported for Baseline pipelines" + ) feature_list = X[feature_names] _raise_value_error_if_any_features_all_nan(feature_list) if feature_list.isnull().sum().any(): - warnings.warn("There are null values in the features, which will cause NaN values in the partial dependence output. 
" - "Fill in these values to remove the NaN values.", NullsInColumnWarning) + warnings.warn( + "There are null values in the features, which will cause NaN values in the partial dependence output. " + "Fill in these values to remove the NaN values.", + NullsInColumnWarning, + ) _raise_value_error_if_mostly_one_value(feature_list, percentiles[1]) wrapped = evalml.pipelines.components.utils.scikit_learn_wrapped_estimator(pipeline) - avg_pred, values = sk_partial_dependence(wrapped, X=X, features=features, percentiles=percentiles, grid_resolution=grid_resolution) + avg_pred, values = sk_partial_dependence( + wrapped, + X=X, + features=features, + percentiles=percentiles, + grid_resolution=grid_resolution, + ) classes = None if isinstance(pipeline, evalml.pipelines.BinaryClassificationPipeline): @@ -562,20 +692,33 @@ def partial_dependence(pipeline, X, features, percentiles=(0.05, 0.95), grid_res classes = pipeline.classes_ if isinstance(features, (int, str)): - data = pd.DataFrame({"feature_values": np.tile(values[0], avg_pred.shape[0]), - "partial_dependence": np.concatenate([pred for pred in avg_pred])}) + data = pd.DataFrame( + { + "feature_values": np.tile(values[0], avg_pred.shape[0]), + "partial_dependence": np.concatenate([pred for pred in avg_pred]), + } + ) elif isinstance(features, (list, tuple)): data = pd.DataFrame(avg_pred.reshape((-1, avg_pred.shape[-1]))) data.columns = values[1] data.index = np.tile(values[0], avg_pred.shape[0]) if classes is not None: - data['class_label'] = np.repeat(classes, len(values[0])) + data["class_label"] = np.repeat(classes, len(values[0])) return data -def _update_fig_with_two_way_partial_dependence(_go, fig, label_df, part_dep, features, is_categorical, - label=None, row=None, col=None): +def _update_fig_with_two_way_partial_dependence( + _go, + fig, + label_df, + part_dep, + features, + is_categorical, + label=None, + row=None, + col=None, +): """Helper for formatting the two-way partial dependence plot.""" y = label_df.index x = label_df.columns @@ -584,9 +727,14 @@ def _update_fig_with_two_way_partial_dependence(_go, fig, label_df, part_dep, fe # No features are categorical. In this case, we pass both x and y data to the Contour plot so that # plotly can figure out the axis formatting for us. kwargs = {"x": x, "y": y} - fig.update_xaxes(title=f'{features[1]}', - range=_calculate_axis_range(np.array([x for x in part_dep.columns if x != 'class_label'])), - row=row, col=col) + fig.update_xaxes( + title=f"{features[1]}", + range=_calculate_axis_range( + np.array([x for x in part_dep.columns if x != "class_label"]) + ), + row=row, + col=col, + ) fig.update_yaxes(range=_calculate_axis_range(part_dep.index), row=row, col=col) elif sum(is_categorical) == 1: # One feature is categorical. Since we put the categorical feature first, the numeric feature will be the x @@ -594,22 +742,46 @@ def _update_fig_with_two_way_partial_dependence(_go, fig, label_df, part_dep, fe # Since the y axis is a categorical value, we will set the y tickmarks ourselves. Passing y to the contour plot # in this case will "work" but the formatting will look bad. 
kwargs = {"x": x} - fig.update_xaxes(title=f'{features[1]}', - range=_calculate_axis_range(np.array([x for x in part_dep.columns if x != 'class_label'])), - row=row, col=col) - fig.update_yaxes(tickmode='array', tickvals=list(range(label_df.shape[0])), - ticktext=list(label_df.index), row=row, col=col) + fig.update_xaxes( + title=f"{features[1]}", + range=_calculate_axis_range( + np.array([x for x in part_dep.columns if x != "class_label"]) + ), + row=row, + col=col, + ) + fig.update_yaxes( + tickmode="array", + tickvals=list(range(label_df.shape[0])), + ticktext=list(label_df.index), + row=row, + col=col, + ) else: # Both features are categorical so we must format both axes ourselves. kwargs = {} - fig.update_yaxes(tickmode='array', tickvals=list(range(label_df.shape[0])), - ticktext=list(label_df.index), row=row, col=col) - fig.update_xaxes(tickmode='array', tickvals=list(range(label_df.shape[1])), - ticktext=list(label_df.columns), row=row, col=col) - fig.add_trace(_go.Contour(z=z, name=label, coloraxis="coloraxis", **kwargs), row=row, col=col) - - -def graph_partial_dependence(pipeline, X, features, class_label=None, grid_resolution=100): + fig.update_yaxes( + tickmode="array", + tickvals=list(range(label_df.shape[0])), + ticktext=list(label_df.index), + row=row, + col=col, + ) + fig.update_xaxes( + tickmode="array", + tickvals=list(range(label_df.shape[1])), + ticktext=list(label_df.columns), + row=row, + col=col, + ) + fig.add_trace( + _go.Contour(z=z, name=label, coloraxis="coloraxis", **kwargs), row=row, col=col + ) + + +def graph_partial_dependence( + pipeline, X, features, class_label=None, grid_resolution=100 +): """Create an one-way or two-way partial dependence plot. Passing a single integer or string as features will create a one-way partial dependence plot with the feature values plotted against the partial dependence. Passing features a tuple of int/strings will create @@ -639,43 +811,62 @@ def graph_partial_dependence(pipeline, X, features, class_label=None, grid_resol X = infer_feature_types(X) if isinstance(features, (list, tuple)): mode = "two-way" - is_categorical = [_is_feature_of_type(f, X, ww.logical_types.Categorical) for f in features] + is_categorical = [ + _is_feature_of_type(f, X, ww.logical_types.Categorical) for f in features + ] if any(is_categorical): features = _put_categorical_feature_first(features, is_categorical[0]) elif isinstance(features, (int, str)): mode = "one-way" is_categorical = _is_feature_of_type(features, X, ww.logical_types.Categorical) - _go = import_or_raise("plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects") + _go = import_or_raise( + "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" + ) if jupyter_check(): import_or_raise("ipywidgets", warning=True) - if isinstance(pipeline, evalml.pipelines.MulticlassClassificationPipeline) and class_label is not None: + if ( + isinstance(pipeline, evalml.pipelines.MulticlassClassificationPipeline) + and class_label is not None + ): if class_label not in pipeline.classes_: msg = f"Class {class_label} is not one of the classes the pipeline was fit on: {', '.join(list(pipeline.classes_))}" raise ValueError(msg) - part_dep = partial_dependence(pipeline, X, features=features, grid_resolution=grid_resolution) + part_dep = partial_dependence( + pipeline, X, features=features, grid_resolution=grid_resolution + ) if mode == "two-way": title = f"Partial Dependence of '{features[0]}' vs. 
'{features[1]}'" - layout = _go.Layout(title={'text': title}, - xaxis={'title': f'{features[1]}'}, - yaxis={'title': f'{features[0]}'}, - showlegend=False) + layout = _go.Layout( + title={"text": title}, + xaxis={"title": f"{features[1]}"}, + yaxis={"title": f"{features[0]}"}, + showlegend=False, + ) elif mode == "one-way": feature_name = str(features) title = f"Partial Dependence of '{feature_name}'" - layout = _go.Layout(title={'text': title}, - xaxis={'title': f'{feature_name}'}, - yaxis={'title': 'Partial Dependence'}, - showlegend=False) + layout = _go.Layout( + title={"text": title}, + xaxis={"title": f"{feature_name}"}, + yaxis={"title": "Partial Dependence"}, + showlegend=False, + ) if isinstance(pipeline, evalml.pipelines.MulticlassClassificationPipeline): class_labels = [class_label] if class_label is not None else pipeline.classes_ - _subplots = import_or_raise("plotly.subplots", error_msg="Cannot find dependency plotly.graph_objects") + _subplots = import_or_raise( + "plotly.subplots", error_msg="Cannot find dependency plotly.graph_objects" + ) # If the user passes in a value for class_label, we want to create a 1 x 1 subplot or else there would # be an empty column in the plot and it would look awkward - rows, cols = ((len(class_labels) + 1) // 2, 2) if len(class_labels) > 1 else (1, len(class_labels)) + rows, cols = ( + ((len(class_labels) + 1) // 2, 2) + if len(class_labels) > 1 + else (1, len(class_labels)) + ) # Don't specify share_xaxis and share_yaxis so that we get tickmarks in each subplot fig = _subplots.make_subplots(rows=rows, cols=cols, subplot_titles=class_labels) @@ -683,13 +874,22 @@ def graph_partial_dependence(pipeline, X, features, class_label=None, grid_resol label_df = part_dep.loc[part_dep.class_label == label] row = (i + 2) // 2 col = (i % 2) + 1 - label_df.drop(columns=['class_label'], inplace=True) - if mode == 'two-way': - _update_fig_with_two_way_partial_dependence(_go, fig, label_df, part_dep, features, is_categorical, - label, row, col) + label_df.drop(columns=["class_label"], inplace=True) + if mode == "two-way": + _update_fig_with_two_way_partial_dependence( + _go, + fig, + label_df, + part_dep, + features, + is_categorical, + label, + row, + col, + ) elif mode == "one-way": - x = label_df['feature_values'] - y = label_df['partial_dependence'] + x = label_df["feature_values"] + y = label_df["partial_dependence"] if is_categorical: trace = _go.Bar(x=x, y=y, name=label) else: @@ -699,31 +899,49 @@ def graph_partial_dependence(pipeline, X, features, class_label=None, grid_resol fig.update_layout(layout) if mode == "two-way": - fig.update_layout(coloraxis=dict(colorscale='Bluered_r'), showlegend=False) + fig.update_layout(coloraxis=dict(colorscale="Bluered_r"), showlegend=False) elif mode == "one-way": - title = f'{feature_name}' - xrange = _calculate_axis_range(part_dep['feature_values']) if not is_categorical else None - yrange = _calculate_axis_range(part_dep['partial_dependence']) + title = f"{feature_name}" + xrange = ( + _calculate_axis_range(part_dep["feature_values"]) + if not is_categorical + else None + ) + yrange = _calculate_axis_range(part_dep["partial_dependence"]) fig.update_xaxes(title=title, range=xrange) fig.update_yaxes(range=yrange) return fig else: if "class_label" in part_dep.columns: - part_dep.drop(columns=['class_label'], inplace=True) + part_dep.drop(columns=["class_label"], inplace=True) if mode == "two-way": fig = _go.Figure(layout=layout) - _update_fig_with_two_way_partial_dependence(_go, fig, part_dep, part_dep, features, 
is_categorical, - label="Partial Dependence", row=None, col=None) + _update_fig_with_two_way_partial_dependence( + _go, + fig, + part_dep, + part_dep, + features, + is_categorical, + label="Partial Dependence", + row=None, + col=None, + ) return fig elif mode == "one-way": if is_categorical: - trace = _go.Bar(x=part_dep['feature_values'], y=part_dep['partial_dependence'], - name="Partial Dependence") + trace = _go.Bar( + x=part_dep["feature_values"], + y=part_dep["partial_dependence"], + name="Partial Dependence", + ) else: - trace = _go.Scatter(x=part_dep['feature_values'], - y=part_dep['partial_dependence'], - name='Partial Dependence', - line=dict(width=3)) + trace = _go.Scatter( + x=part_dep["feature_values"], + y=part_dep["partial_dependence"], + name="Partial Dependence", + line=dict(width=3), + ) return _go.Figure(layout=layout, data=[trace]) @@ -753,20 +971,25 @@ def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None): """ if outlier_threshold and outlier_threshold <= 0: - raise ValueError(f"Threshold must be positive! Provided threshold is {outlier_threshold}") + raise ValueError( + f"Threshold must be positive! Provided threshold is {outlier_threshold}" + ) y_true = infer_feature_types(y_true) y_pred = infer_feature_types(y_pred) predictions = y_pred.reset_index(drop=True) actual = y_true.reset_index(drop=True) - data = pd.concat([pd.Series(predictions), - pd.Series(actual)], axis=1) - data.columns = ['prediction', 'actual'] + data = pd.concat([pd.Series(predictions), pd.Series(actual)], axis=1) + data.columns = ["prediction", "actual"] if outlier_threshold: - data['outlier'] = np.where((abs(data['prediction'] - data['actual']) >= outlier_threshold), "#ffff00", "#0000ff") + data["outlier"] = np.where( + (abs(data["prediction"] - data["actual"]) >= outlier_threshold), + "#ffff00", + "#0000ff", + ) else: - data['outlier'] = '#0000ff' + data["outlier"] = "#0000ff" return data @@ -784,36 +1007,50 @@ def graph_prediction_vs_actual(y_true, y_pred, outlier_threshold=None): plotly.Figure representing the predicted vs. actual values graph """ - _go = import_or_raise("plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects") + _go = import_or_raise( + "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" + ) if jupyter_check(): import_or_raise("ipywidgets", warning=True) if outlier_threshold and outlier_threshold <= 0: - raise ValueError(f"Threshold must be positive! Provided threshold is {outlier_threshold}") + raise ValueError( + f"Threshold must be positive! 
Provided threshold is {outlier_threshold}" + ) df = get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold) data = [] - x_axis = _calculate_axis_range(df['prediction']) - y_axis = _calculate_axis_range(df['actual']) + x_axis = _calculate_axis_range(df["prediction"]) + y_axis = _calculate_axis_range(df["actual"]) x_y_line = [min(x_axis[0], y_axis[0]), max(x_axis[1], y_axis[1])] - data.append(_go.Scatter(x=x_y_line, y=x_y_line, name="y = x line", line_color='grey')) - - title = 'Predicted vs Actual Values Scatter Plot' - layout = _go.Layout(title={'text': title}, - xaxis={'title': 'Prediction', 'range': x_y_line}, - yaxis={'title': 'Actual', 'range': x_y_line}) - - for color, outlier_group in df.groupby('outlier'): + data.append( + _go.Scatter(x=x_y_line, y=x_y_line, name="y = x line", line_color="grey") + ) + + title = "Predicted vs Actual Values Scatter Plot" + layout = _go.Layout( + title={"text": title}, + xaxis={"title": "Prediction", "range": x_y_line}, + yaxis={"title": "Actual", "range": x_y_line}, + ) + + for color, outlier_group in df.groupby("outlier"): if outlier_threshold: - name = "< outlier_threshold" if color == "#0000ff" else ">= outlier_threshold" + name = ( + "< outlier_threshold" if color == "#0000ff" else ">= outlier_threshold" + ) else: name = "Values" - data.append(_go.Scatter(x=outlier_group['prediction'], - y=outlier_group['actual'], - mode='markers', - marker=_go.scatter.Marker(color=color), - name=name)) + data.append( + _go.Scatter( + x=outlier_group["prediction"], + y=outlier_group["actual"], + mode="markers", + marker=_go.scatter.Marker(color=color), + name=name, + ) + ) return _go.Figure(layout=layout, data=data) @@ -826,14 +1063,16 @@ def _tree_parse(est, feature_names): def recurse(i): if children_left[i] == children_right[i]: - return {'Value': values[i]} - return OrderedDict({ - 'Feature': feature_names[features[i]], - 'Threshold': thresholds[i], - 'Value': values[i], - 'Left_Child': recurse(children_left[i]), - 'Right_Child': recurse(children_right[i]) - }) + return {"Value": values[i]} + return OrderedDict( + { + "Feature": feature_names[features[i]], + "Threshold": thresholds[i], + "Value": values[i], + "Left_Child": recurse(children_left[i]), + "Right_Child": recurse(children_right[i]), + } + ) return recurse(0) @@ -848,10 +1087,14 @@ def decision_tree_data_from_estimator(estimator): OrderedDict: An OrderedDict of OrderedDicts describing a tree structure """ if not estimator.model_family == ModelFamily.DECISION_TREE: - raise ValueError("Tree structure reformatting is only supported for decision tree estimators") + raise ValueError( + "Tree structure reformatting is only supported for decision tree estimators" + ) if not estimator._is_fitted: - raise NotFittedError("This DecisionTree estimator is not fitted yet. Call 'fit' with appropriate arguments " - "before using this estimator.") + raise NotFittedError( + "This DecisionTree estimator is not fitted yet. Call 'fit' with appropriate arguments " + "before using this estimator." 
+ ) est = estimator._component_obj feature_names = estimator.input_feature_names return _tree_parse(est, feature_names) @@ -867,17 +1110,23 @@ def decision_tree_data_from_pipeline(pipeline_): OrderedDict: An OrderedDict of OrderedDicts describing a tree structure """ if not pipeline_.model_family == ModelFamily.DECISION_TREE: - raise ValueError("Tree structure reformatting is only supported for decision tree estimators") + raise ValueError( + "Tree structure reformatting is only supported for decision tree estimators" + ) if not pipeline_._is_fitted: - raise NotFittedError("The DecisionTree estimator associated with this pipeline is not fitted yet. Call 'fit' " - "with appropriate arguments before using this estimator.") + raise NotFittedError( + "The DecisionTree estimator associated with this pipeline is not fitted yet. Call 'fit' " + "with appropriate arguments before using this estimator." + ) est = pipeline_.estimator._component_obj feature_names = pipeline_.input_feature_names[pipeline_.estimator.name] return _tree_parse(est, feature_names) -def visualize_decision_tree(estimator, max_depth=None, rotate=False, filled=False, filepath=None): +def visualize_decision_tree( + estimator, max_depth=None, rotate=False, filled=False, filepath=None +): """Generate an image visualizing the decision tree Arguments: @@ -894,37 +1143,58 @@ def visualize_decision_tree(estimator, max_depth=None, rotate=False, filled=Fals graphviz.Source: DOT object that can be directly displayed in Jupyter notebooks. """ if not estimator.model_family == ModelFamily.DECISION_TREE: - raise ValueError("Tree visualizations are only supported for decision tree estimators") + raise ValueError( + "Tree visualizations are only supported for decision tree estimators" + ) if max_depth and (not isinstance(max_depth, int) or not max_depth >= 0): - raise ValueError("Unknown value: '{}'. The parameter max_depth has to be a non-negative integer" - .format(max_depth)) + raise ValueError( + "Unknown value: '{}'. The parameter max_depth has to be a non-negative integer".format( + max_depth + ) + ) if not estimator._is_fitted: - raise NotFittedError("This DecisionTree estimator is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.") + raise NotFittedError( + "This DecisionTree estimator is not fitted yet. Call 'fit' with appropriate arguments before using this estimator." + ) est = estimator._component_obj - graphviz = import_or_raise('graphviz', error_msg='Please install graphviz to visualize trees.') + graphviz = import_or_raise( + "graphviz", error_msg="Please install graphviz to visualize trees." + ) graph_format = None if filepath: # Cast to str in case a Path object was passed in filepath = str(filepath) try: - f = open(filepath, 'w') + f = open(filepath, "w") f.close() except (IOError, FileNotFoundError): - raise ValueError(('Specified filepath is not writeable: {}'.format(filepath))) + raise ValueError( + ("Specified filepath is not writeable: {}".format(filepath)) + ) path_and_name, graph_format = os.path.splitext(filepath) if graph_format: graph_format = graph_format[1:].lower() # ignore the dot supported_filetypes = graphviz.backend.FORMATS if graph_format not in supported_filetypes: - raise ValueError(("Unknown format '{}'. Make sure your format is one of the " + - "following: {}").format(graph_format, supported_filetypes)) + raise ValueError( + ( + "Unknown format '{}'. 
Make sure your format is one of the " + + "following: {}" + ).format(graph_format, supported_filetypes) + ) else: - graph_format = 'pdf' # If the filepath has no extension default to pdf - - dot_data = export_graphviz(decision_tree=est, max_depth=max_depth, rotate=rotate, filled=filled, feature_names=estimator.input_feature_names) + graph_format = "pdf" # If the filepath has no extension default to pdf + + dot_data = export_graphviz( + decision_tree=est, + max_depth=max_depth, + rotate=rotate, + filled=filled, + feature_names=estimator.input_feature_names, + ) source_obj = graphviz.Source(source=dot_data, format=graph_format) if filepath: source_obj.render(filename=path_and_name, cleanup=True) @@ -949,9 +1219,13 @@ def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates): y = infer_feature_types(y) prediction = pipeline.predict(X, y) - return pd.DataFrame({"dates": dates.reset_index(drop=True), - "target": y.reset_index(drop=True), - "prediction": prediction.reset_index(drop=True)}) + return pd.DataFrame( + { + "dates": dates.reset_index(drop=True), + "target": y.reset_index(drop=True), + "prediction": prediction.reset_index(drop=True), + } + ) def graph_prediction_vs_actual_over_time(pipeline, X, y, dates): @@ -966,22 +1240,40 @@ def graph_prediction_vs_actual_over_time(pipeline, X, y, dates): Returns: plotly.Figure: Showing the prediction vs actual over time. """ - _go = import_or_raise("plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects") + _go = import_or_raise( + "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" + ) if pipeline.problem_type != ProblemTypes.TIME_SERIES_REGRESSION: - raise ValueError("graph_prediction_vs_actual_over_time only supports time series regression pipelines! " - f"Received {str(pipeline.problem_type)}.") + raise ValueError( + "graph_prediction_vs_actual_over_time only supports time series regression pipelines! " + f"Received {str(pipeline.problem_type)}." + ) data = get_prediction_vs_actual_over_time_data(pipeline, X, y, dates) - data = [_go.Scatter(x=data["dates"], y=data["target"], mode='lines+markers', name="Target", - line=dict(color='#1f77b4')), - _go.Scatter(x=data["dates"], y=data["prediction"], mode='lines+markers', name='Prediction', - line=dict(color='#d62728'))] + data = [ + _go.Scatter( + x=data["dates"], + y=data["target"], + mode="lines+markers", + name="Target", + line=dict(color="#1f77b4"), + ), + _go.Scatter( + x=data["dates"], + y=data["prediction"], + mode="lines+markers", + name="Prediction", + line=dict(color="#d62728"), + ), + ] # Let plotly pick the best date format. - layout = _go.Layout(title={'text': "Prediction vs Target over time"}, - xaxis={'title': 'Time'}, - yaxis={'title': 'Target Values and Predictions'}) + layout = _go.Layout( + title={"text": "Prediction vs Target over time"}, + xaxis={"title": "Time"}, + yaxis={"title": "Target Values and Predictions"}, + ) return _go.Figure(data=data, layout=layout) @@ -997,19 +1289,32 @@ def get_linear_coefficients(estimator, features=None): pd.DataFrame: Displaying the features by importance. """ if not estimator.model_family == ModelFamily.LINEAR_MODEL: - raise ValueError("Linear coefficients are only available for linear family models") + raise ValueError( + "Linear coefficients are only available for linear family models" + ) if not estimator._is_fitted: - raise NotFittedError("This linear estimator is not fitted yet. 
Call 'fit' with appropriate arguments " - "before using this estimator.") + raise NotFittedError( + "This linear estimator is not fitted yet. Call 'fit' with appropriate arguments " + "before using this estimator." + ) coef_ = estimator.feature_importance - coef_ = pd.Series(coef_, name='Coefficients', index=features) + coef_ = pd.Series(coef_, name="Coefficients", index=features) coef_ = coef_.sort_values() - coef_ = pd.Series(estimator._component_obj.intercept_, index=['Intercept']).append(coef_) + coef_ = pd.Series(estimator._component_obj.intercept_, index=["Intercept"]).append( + coef_ + ) return coef_ -def t_sne(X, n_components=2, perplexity=30.0, learning_rate=200.0, metric='euclidean', **kwargs): +def t_sne( + X, + n_components=2, + perplexity=30.0, + learning_rate=200.0, + metric="euclidean", + **kwargs, +): """Get the transformed output after fitting X to the embedded space using t-SNE. Arguments: @@ -1025,17 +1330,34 @@ def t_sne(X, n_components=2, perplexity=30.0, learning_rate=200.0, metric='eucli np.ndarray (n_samples, n_components) """ if not isinstance(n_components, int) or not n_components > 0: - raise ValueError("The parameter n_components must be of type integer and greater than 0") + raise ValueError( + "The parameter n_components must be of type integer and greater than 0" + ) if not perplexity >= 0: raise ValueError("The parameter perplexity must be non-negative") X = infer_feature_types(X) - t_sne_ = TSNE(n_components=n_components, perplexity=perplexity, learning_rate=learning_rate, metric=metric, **kwargs) + t_sne_ = TSNE( + n_components=n_components, + perplexity=perplexity, + learning_rate=learning_rate, + metric=metric, + **kwargs, + ) X_new = t_sne_.fit_transform(X) return X_new -def graph_t_sne(X, n_components=2, perplexity=30.0, learning_rate=200.0, metric='euclidean', marker_line_width=2, marker_size=7, **kwargs): +def graph_t_sne( + X, + n_components=2, + perplexity=30.0, + learning_rate=200.0, + metric="euclidean", + marker_line_width=2, + marker_size=7, + **kwargs, +): """Plot high dimensional data into lower dimensional space using t-SNE . 
Arguments: @@ -1053,21 +1375,29 @@ def graph_t_sne(X, n_components=2, perplexity=30.0, learning_rate=200.0, metric= plotly.Figure representing the transformed data """ - _go = import_or_raise("plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects") + _go = import_or_raise( + "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects" + ) if not marker_line_width >= 0: raise ValueError("The parameter marker_line_width must be non-negative") if not marker_size >= 0: raise ValueError("The parameter marker_size must be non-negative") - X_embedded = t_sne(X, n_components=n_components, perplexity=perplexity, learning_rate=learning_rate, metric=metric, **kwargs) + X_embedded = t_sne( + X, + n_components=n_components, + perplexity=perplexity, + learning_rate=learning_rate, + metric=metric, + **kwargs, + ) fig = _go.Figure() - fig.add_trace(_go.Scatter( - x=X_embedded[:, 0], y=X_embedded[:, 1], - mode='markers' - )) - fig.update_traces(mode='markers', marker_line_width=marker_line_width, marker_size=marker_size) - fig.update_layout(title='t-SNE', yaxis_zeroline=False, xaxis_zeroline=False) + fig.add_trace(_go.Scatter(x=X_embedded[:, 0], y=X_embedded[:, 1], mode="markers")) + fig.update_traces( + mode="markers", marker_line_width=marker_line_width, marker_size=marker_size + ) + fig.update_layout(title="t-SNE", yaxis_zeroline=False, xaxis_zeroline=False) return fig diff --git a/evalml/model_understanding/permutation_importance.py b/evalml/model_understanding/permutation_importance.py index e2b31e0870..a2420e63f9 100644 --- a/evalml/model_understanding/permutation_importance.py +++ b/evalml/model_understanding/permutation_importance.py @@ -7,7 +7,9 @@ from evalml.utils import infer_feature_types -def calculate_permutation_importance(pipeline, X, y, objective, n_repeats=5, n_jobs=None, random_seed=0): +def calculate_permutation_importance( + pipeline, X, y, objective, n_repeats=5, n_jobs=None, random_seed=0 +): """Calculates permutation importance for features. 
Arguments: @@ -27,20 +29,32 @@ def calculate_permutation_importance(pipeline, X, y, objective, n_repeats=5, n_j objective = get_objective(objective, return_instance=True) if not objective.is_defined_for_problem_type(pipeline.problem_type): - raise ValueError(f"Given objective '{objective.name}' cannot be used with '{pipeline.name}'") + raise ValueError( + f"Given objective '{objective.name}' cannot be used with '{pipeline.name}'" + ) if pipeline._supports_fast_permutation_importance: precomputed_features = pipeline.compute_estimator_features(X, y) - perm_importance = _fast_permutation_importance(pipeline, X, y, objective, - precomputed_features, - n_repeats=n_repeats, - n_jobs=n_jobs, - random_seed=random_seed) + perm_importance = _fast_permutation_importance( + pipeline, + X, + y, + objective, + precomputed_features, + n_repeats=n_repeats, + n_jobs=n_jobs, + random_seed=random_seed, + ) else: - perm_importance = _slow_permutation_importance(pipeline, X, y, objective, - n_repeats=n_repeats, - n_jobs=n_jobs, - random_seed=random_seed) + perm_importance = _slow_permutation_importance( + pipeline, + X, + y, + objective, + n_repeats=n_repeats, + n_jobs=n_jobs, + random_seed=random_seed, + ) mean_perm_importance = perm_importance["importances_mean"] feature_names = list(X.columns) @@ -49,8 +63,17 @@ def calculate_permutation_importance(pipeline, X, y, objective, n_repeats=5, n_j return pd.DataFrame(mean_perm_importance, columns=["feature", "importance"]) -def calculate_permutation_importance_one_column(pipeline, X, y, col_name, objective, - n_repeats=5, fast=True, precomputed_features=None, random_seed=0): +def calculate_permutation_importance_one_column( + pipeline, + X, + y, + col_name, + objective, + n_repeats=5, + fast=True, + precomputed_features=None, + random_seed=0, +): """Calculates permutation importance for one column in the original dataframe. 
Arguments: @@ -73,23 +96,47 @@ def calculate_permutation_importance_one_column(pipeline, X, y, col_name, object if fast: if not pipeline._supports_fast_permutation_importance: - raise ValueError("Pipeline does not support fast permutation importance calculation") + raise ValueError( + "Pipeline does not support fast permutation importance calculation" + ) if precomputed_features is None: - raise ValueError("Fast method of calculating permutation importance requires precomputed_features") - permutation_importance = _fast_permutation_importance(pipeline, X, y, objective, - precomputed_features, - col_name=col_name, - n_repeats=n_repeats, - random_seed=random_seed) + raise ValueError( + "Fast method of calculating permutation importance requires precomputed_features" + ) + permutation_importance = _fast_permutation_importance( + pipeline, + X, + y, + objective, + precomputed_features, + col_name=col_name, + n_repeats=n_repeats, + random_seed=random_seed, + ) else: - permutation_importance = _slow_permutation_importance(pipeline, X, y, objective, - col_name=col_name, - n_repeats=n_repeats, - random_seed=random_seed) + permutation_importance = _slow_permutation_importance( + pipeline, + X, + y, + objective, + col_name=col_name, + n_repeats=n_repeats, + random_seed=random_seed, + ) return permutation_importance["importances_mean"] -def _fast_permutation_importance(pipeline, X, y, objective, precomputed_features, col_name=None, n_repeats=5, n_jobs=None, random_seed=None): +def _fast_permutation_importance( + pipeline, + X, + y, + objective, + precomputed_features, + col_name=None, + n_repeats=5, + n_jobs=None, + random_seed=None, +): """Calculate permutation importance faster by only computing the estimator features once. Only used for pipelines that support this optimization. 
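# Editorial aside, not part of the patch: a minimal, self-contained sketch of the
# shuffle-and-rescore idea that both the fast and slow permutation-importance paths in
# this file implement (importance = baseline score minus the score obtained after
# shuffling one column, averaged over n_repeats shuffles). All names and data below are
# illustrative only and are not part of the evalml API.
import numpy as np
import pandas as pd


def toy_permutation_importance(score_fn, X, y, col, n_repeats=5, seed=0):
    """Mean drop in score_fn when column `col` of X is shuffled."""
    rng = np.random.RandomState(seed)
    baseline = score_fn(X, y)
    drops = np.zeros(n_repeats)
    for i in range(n_repeats):
        X_shuffled = X.copy()
        X_shuffled[col] = rng.permutation(X_shuffled[col].values)
        drops[i] = baseline - score_fn(X_shuffled, y)
    return drops.mean()


# Usage with a toy "scorer" whose accuracy depends only on column "a":
X_demo = pd.DataFrame({"a": [0, 1, 0, 1, 0, 1], "b": [5, 5, 5, 5, 5, 5]})
y_demo = pd.Series([0, 1, 0, 1, 0, 1])
accuracy = lambda X, y: float((X["a"] == y).mean())
print(toy_permutation_importance(accuracy, X_demo, y_demo, "a"))  # typically > 0: shuffling "a" hurts
print(toy_permutation_importance(accuracy, X_demo, y_demo, "b"))  # exactly 0: "b" is unused
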
@@ -99,67 +146,143 @@ def _fast_permutation_importance(pipeline, X, y, objective, precomputed_features baseline_score = _fast_scorer(pipeline, precomputed_features, X, y, objective) if col_name is None: - scores = Parallel(n_jobs=n_jobs)(delayed(_calculate_permutation_scores_fast)( - pipeline, precomputed_features, y, objective, col_name, random_seed, n_repeats, _fast_scorer, baseline_score, - ) for col_name in X.columns) + scores = Parallel(n_jobs=n_jobs)( + delayed(_calculate_permutation_scores_fast)( + pipeline, + precomputed_features, + y, + objective, + col_name, + random_seed, + n_repeats, + _fast_scorer, + baseline_score, + ) + for col_name in X.columns + ) importances = baseline_score - np.array(scores) - return {'importances_mean': np.mean(importances, axis=1)} + return {"importances_mean": np.mean(importances, axis=1)} else: scores = _calculate_permutation_scores_fast( - pipeline, precomputed_features, y, objective, col_name, random_seed, n_repeats, _fast_scorer, baseline_score, + pipeline, + precomputed_features, + y, + objective, + col_name, + random_seed, + n_repeats, + _fast_scorer, + baseline_score, ) importances = baseline_score - np.array(scores) - importances_mean = np.mean(importances, axis=1) if col_name is None else np.mean(importances) - return {'importances_mean': importances_mean} - - -def _calculate_permutation_scores_fast(pipeline, precomputed_features, y, objective, col_name, - random_seed, n_repeats, scorer, baseline_score): + importances_mean = ( + np.mean(importances, axis=1) if col_name is None else np.mean(importances) + ) + return {"importances_mean": importances_mean} + + +def _calculate_permutation_scores_fast( + pipeline, + precomputed_features, + y, + objective, + col_name, + random_seed, + n_repeats, + scorer, + baseline_score, +): """Calculate the permutation score when `col_name` is permuted.""" random_state = np.random.RandomState(random_seed) scores = np.zeros(n_repeats) # If column is not in the features or provenance, assume the column was dropped - if col_name not in precomputed_features.columns and col_name not in pipeline._get_feature_provenance(): + if ( + col_name not in precomputed_features.columns + and col_name not in pipeline._get_feature_provenance() + ): return scores + baseline_score if col_name in precomputed_features.columns: col_idx = precomputed_features.columns.get_loc(col_name) else: - col_idx = [precomputed_features.columns.get_loc(col) for col in pipeline._get_feature_provenance()[col_name]] - - return _shuffle_and_score_helper(pipeline, precomputed_features, y, objective, col_idx, n_repeats, scorer, random_state, is_fast=True) - - -def _slow_permutation_importance(pipeline, X, y, objective, col_name=None, n_repeats=5, n_jobs=None, random_seed=None): + col_idx = [ + precomputed_features.columns.get_loc(col) + for col in pipeline._get_feature_provenance()[col_name] + ] + + return _shuffle_and_score_helper( + pipeline, + precomputed_features, + y, + objective, + col_idx, + n_repeats, + scorer, + random_state, + is_fast=True, + ) + + +def _slow_permutation_importance( + pipeline, X, y, objective, col_name=None, n_repeats=5, n_jobs=None, random_seed=None +): """ If `col_name` is not None, calculates permutation importance for only the column with that name. Otherwise, calculates the permutation importance for all columns in the input dataframe. 
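    In either case, each reported value matches the computation below:
    importance = baseline_score - score_after_shuffling_the_column, repeated for
    n_repeats independent shuffles and then averaged with np.mean.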
""" baseline_score = _slow_scorer(pipeline, X, y, objective) if col_name is None: - scores = Parallel(n_jobs=n_jobs)(delayed(_calculate_permutation_scores_slow)( - pipeline, X, y, col_idx, objective, _slow_scorer, n_repeats, random_seed - ) for col_idx in range(X.shape[1])) + scores = Parallel(n_jobs=n_jobs)( + delayed(_calculate_permutation_scores_slow)( + pipeline, X, y, col_idx, objective, _slow_scorer, n_repeats, random_seed + ) + for col_idx in range(X.shape[1]) + ) else: baseline_score = _slow_scorer(pipeline, X, y, objective) - scores = _calculate_permutation_scores_slow(pipeline, X, y, col_name, objective, _slow_scorer, n_repeats, random_seed) + scores = _calculate_permutation_scores_slow( + pipeline, X, y, col_name, objective, _slow_scorer, n_repeats, random_seed + ) importances = baseline_score - np.array(scores) - importances_mean = np.mean(importances, axis=1) if col_name is None else np.mean(importances) - return {'importances_mean': importances_mean} + importances_mean = ( + np.mean(importances, axis=1) if col_name is None else np.mean(importances) + ) + return {"importances_mean": importances_mean} -def _calculate_permutation_scores_slow(estimator, X, y, col_name, objective, scorer, - n_repeats, random_seed): +def _calculate_permutation_scores_slow( + estimator, X, y, col_name, objective, scorer, n_repeats, random_seed +): """Calculate score when `col_idx` is permuted.""" random_state = np.random.RandomState(random_seed) col_idx = col_name if col_name in X.columns: col_idx = X.columns.get_loc(col_name) - return _shuffle_and_score_helper(estimator, X, y, objective, col_idx, n_repeats, scorer, random_state, is_fast=False) - - -def _shuffle_and_score_helper(pipeline, X_features, y, objective, col_idx, n_repeats, scorer, random_state, is_fast=True): + return _shuffle_and_score_helper( + estimator, + X, + y, + objective, + col_idx, + n_repeats, + scorer, + random_state, + is_fast=False, + ) + + +def _shuffle_and_score_helper( + pipeline, + X_features, + y, + objective, + col_idx, + n_repeats, + scorer, + random_state, + is_fast=True, +): scores = np.zeros(n_repeats) # This is what sk_permutation_importance does. Useful for thread safety @@ -180,7 +303,11 @@ def _shuffle_and_score_helper(pipeline, X_features, y, objective, col_idx, n_rep def _slow_scorer(pipeline, X, y, objective): scores = pipeline.score(X, y, objectives=[objective]) - return scores[objective.name] if objective.greater_is_better else -scores[objective.name] + return ( + scores[objective.name] + if objective.greater_is_better + else -scores[objective.name] + ) def _fast_scorer(pipeline, features, X, y, objective): diff --git a/evalml/model_understanding/prediction_explanations/_algorithms.py b/evalml/model_understanding/prediction_explanations/_algorithms.py index fa080541cb..80b14a1504 100644 --- a/evalml/model_understanding/prediction_explanations/_algorithms.py +++ b/evalml/model_understanding/prediction_explanations/_algorithms.py @@ -46,7 +46,9 @@ def _compute_shap_values(pipeline, features, training_data=None): """ estimator = pipeline.estimator if estimator.model_family == ModelFamily.BASELINE: - raise ValueError("You passed in a baseline pipeline. These are simple enough that SHAP values are not needed.") + raise ValueError( + "You passed in a baseline pipeline. These are simple enough that SHAP values are not needed." 
+ ) feature_names = features.columns @@ -59,7 +61,9 @@ def _compute_shap_values(pipeline, features, training_data=None): if estimator.model_family.is_tree_estimator(): # Use tree_path_dependent to avoid linear runtime with dataset size with warnings.catch_warnings(record=True) as ws: - explainer = shap.TreeExplainer(estimator._component_obj, feature_perturbation="tree_path_dependent") + explainer = shap.TreeExplainer( + estimator._component_obj, feature_perturbation="tree_path_dependent" + ) if ws: logger.debug(f"_compute_shap_values TreeExplainer: {ws[0].message}") shap_values = explainer.shap_values(features, check_additivity=False) @@ -67,13 +71,22 @@ def _compute_shap_values(pipeline, features, training_data=None): # this modifies the output to match the output format of other binary estimators. # Ok to fill values of negative class with zeros since the negative class will get dropped # in the UI anyways. - if estimator.model_family in {ModelFamily.CATBOOST, ModelFamily.XGBOOST} and is_binary(pipeline.problem_type): + if ( + estimator.model_family + in { + ModelFamily.CATBOOST, + ModelFamily.XGBOOST, + } + and is_binary(pipeline.problem_type) + ): shap_values = [np.zeros(shap_values.shape), shap_values] else: if training_data is None: - raise ValueError("You must pass in a value for parameter 'training_data' when the pipeline " - "does not have a tree-based estimator. " - f"Current estimator model family is {estimator.model_family}.") + raise ValueError( + "You must pass in a value for parameter 'training_data' when the pipeline " + "does not have a tree-based estimator. " + f"Current estimator model family is {estimator.model_family}." + ) # More than 100 datapoints can negatively impact runtime according to SHAP # https://github.com/slundberg/shap/blob/master/shap/explainers/kernel.py#L114 @@ -86,7 +99,9 @@ def _compute_shap_values(pipeline, features, training_data=None): link_function = "logit" decision_function = estimator._component_obj.predict_proba with warnings.catch_warnings(record=True) as ws: - explainer = shap.KernelExplainer(decision_function, sampled_training_data_features, link_function) + explainer = shap.KernelExplainer( + decision_function, sampled_training_data_features, link_function + ) shap_values = explainer.shap_values(features) if ws: logger.debug(f"_compute_shap_values KernelExplainer: {ws[0].message}") @@ -165,7 +180,10 @@ def _aggregate_shap_values(values, provenance): if isinstance(values, dict): return _aggreggate_shap_values_dict(values, provenance) else: - return [_aggreggate_shap_values_dict(class_values, provenance) for class_values in values] + return [ + _aggreggate_shap_values_dict(class_values, provenance) + for class_values in values + ] def _normalize_values_dict(values): @@ -192,7 +210,10 @@ def _normalize_values_dict(values): scaled_values = all_values / np.abs(all_values).sum(axis=1)[:, np.newaxis] - return {feature_name: scaled_values[:, i].tolist() for i, feature_name in enumerate(feature_names)} + return { + feature_name: scaled_values[:, i].tolist() + for i, feature_name in enumerate(feature_names) + } def _normalize_shap_values(values): @@ -210,4 +231,6 @@ def _normalize_shap_values(values): elif isinstance(values, list): return [_normalize_values_dict(class_values) for class_values in values] else: - raise ValueError(f"Unsupported data type for _normalize_shap_values: {str(type(values))}.") + raise ValueError( + f"Unsupported data type for _normalize_shap_values: {str(type(values))}." 
+ ) diff --git a/evalml/model_understanding/prediction_explanations/_report_creator_factory.py b/evalml/model_understanding/prediction_explanations/_report_creator_factory.py index 619ee0d1d5..8e91415578 100644 --- a/evalml/model_understanding/prediction_explanations/_report_creator_factory.py +++ b/evalml/model_understanding/prediction_explanations/_report_creator_factory.py @@ -3,18 +3,27 @@ _Heading, _RegressionPredictedValues, _ReportMaker, - _SHAPTable + _SHAPTable, ) from evalml.problem_types import is_regression def _best_worst_predicted_values_section(data, regression, classification): """Get and initialize the predicted values section maker given the data.""" - predicted_values_class = regression if is_regression(data.pipeline.problem_type) else classification + predicted_values_class = ( + regression if is_regression(data.pipeline.problem_type) else classification + ) return predicted_values_class(data.metric.__name__, data.y_pred_values) -def _report_creator_factory(data, report_type, output_format, top_k_features, include_shap_values, num_to_explain=None): +def _report_creator_factory( + data, + report_type, + output_format, + top_k_features, + include_shap_values, + num_to_explain=None, +): """Get and initialize the report creator class given the ReportData and parameters passed in by the user. Arguments: @@ -40,21 +49,32 @@ def _report_creator_factory(data, report_type, output_format, top_k_features, in report_maker = _ReportMaker(None, None, shap_table).make_dataframe elif report_type == "explain_predictions_best_worst" and output_format == "text": heading_maker = _Heading(["Best ", "Worst "], n_indices=num_to_explain) - predicted_values = _best_worst_predicted_values_section(data, _RegressionPredictedValues, - _ClassificationPredictedValues) + predicted_values = _best_worst_predicted_values_section( + data, _RegressionPredictedValues, _ClassificationPredictedValues + ) table_maker = _SHAPTable(top_k_features, include_shap_values) - report_maker = _ReportMaker(heading_maker, predicted_values, table_maker).make_text - elif report_type == "explain_predictions_best_worst" and output_format == "dataframe": + report_maker = _ReportMaker( + heading_maker, predicted_values, table_maker + ).make_text + elif ( + report_type == "explain_predictions_best_worst" and output_format == "dataframe" + ): heading_maker = _Heading(["best", "worst"], n_indices=num_to_explain) table_maker = _SHAPTable(top_k_features, include_shap_values) - predicted_values = _best_worst_predicted_values_section(data, _RegressionPredictedValues, - _ClassificationPredictedValues) - report_maker = _ReportMaker(heading_maker, predicted_values, table_maker).make_dataframe + predicted_values = _best_worst_predicted_values_section( + data, _RegressionPredictedValues, _ClassificationPredictedValues + ) + report_maker = _ReportMaker( + heading_maker, predicted_values, table_maker + ).make_dataframe else: heading_maker = _Heading(["best", "worst"], n_indices=num_to_explain) table_maker = _SHAPTable(top_k_features, include_shap_values) - predicted_values = _best_worst_predicted_values_section(data, _RegressionPredictedValues, - _ClassificationPredictedValues) - report_maker = _ReportMaker(heading_maker, predicted_values, table_maker).make_dict + predicted_values = _best_worst_predicted_values_section( + data, _RegressionPredictedValues, _ClassificationPredictedValues + ) + report_maker = _ReportMaker( + heading_maker, predicted_values, table_maker + ).make_dict return report_maker diff --git 
a/evalml/model_understanding/prediction_explanations/_user_interface.py b/evalml/model_understanding/prediction_explanations/_user_interface.py index ee39577799..46c5566d38 100644 --- a/evalml/model_understanding/prediction_explanations/_user_interface.py +++ b/evalml/model_understanding/prediction_explanations/_user_interface.py @@ -6,13 +6,20 @@ from evalml.model_understanding.prediction_explanations._algorithms import ( _aggregate_shap_values, _compute_shap_values, - _normalize_shap_values + _normalize_shap_values, ) from evalml.problem_types import ProblemTypes -def _make_rows(shap_values, normalized_values, pipeline_features, original_features, top_k, include_shap_values=False, - convert_numeric_to_string=True): +def _make_rows( + shap_values, + normalized_values, + pipeline_features, + original_features, + top_k, + include_shap_values=False, + convert_numeric_to_string=True, +): """Makes the rows (one row for each feature) for the SHAP table. Arguments: @@ -29,7 +36,9 @@ def _make_rows(shap_values, normalized_values, pipeline_features, original_featu Returns: list[str] """ - tuples = [(value[0], feature_name) for feature_name, value in normalized_values.items()] + tuples = [ + (value[0], feature_name) for feature_name, value in normalized_values.items() + ] # Sort the features s.t the top_k_features w the largest shap value magnitudes are the first # top_k_features elements @@ -52,7 +61,9 @@ def _make_rows(shap_values, normalized_values, pipeline_features, original_featu feature_value = original_features[feature_name].iloc[0] if convert_numeric_to_string: - if pd.api.types.is_number(feature_value) and not pd.api.types.is_bool(feature_value): + if pd.api.types.is_number(feature_value) and not pd.api.types.is_bool( + feature_value + ): feature_value = "{:.2f}".format(feature_value) else: feature_value = str(feature_value) @@ -84,9 +95,12 @@ def _rows_to_dict(rows): qualitative_explanations.append(qualitative) quantitative_explanations.append(quantitative) - return {"feature_names": feature_names, "feature_values": feature_values, - "qualitative_explanation": qualitative_explanations, - "quantitative_explanation": quantitative_explanations} + return { + "feature_names": feature_names, + "feature_values": feature_values, + "qualitative_explanation": qualitative_explanations, + "quantitative_explanation": quantitative_explanations, + } def _make_json_serializable(value): @@ -104,8 +118,14 @@ def _make_json_serializable(value): return value -def _make_text_table(shap_values, normalized_values, pipeline_features, original_features, - top_k, include_shap_values=False): +def _make_text_table( + shap_values, + normalized_values, + pipeline_features, + original_features, + top_k, + include_shap_values=False, +): """Make a table displaying the SHAP values for a prediction. 
Arguments: @@ -132,7 +152,14 @@ def _make_text_table(shap_values, normalized_values, pipeline_features, original header.append("SHAP Value") rows = [header] - rows += _make_rows(shap_values, normalized_values, pipeline_features, original_features, top_k, include_shap_values) + rows += _make_rows( + shap_values, + normalized_values, + pipeline_features, + original_features, + top_k, + include_shap_values, + ) table.add_rows(rows) return table.draw() @@ -146,8 +173,14 @@ def __init__(self, top_k, include_shap_values, provenance): self.provenance = provenance @staticmethod - def make_drill_down_dict(provenance, shap_values, normalized_values, pipeline_features, - original_features, include_shap_values): + def make_drill_down_dict( + provenance, + shap_values, + normalized_values, + pipeline_features, + original_features, + include_shap_values, + ): """Format the 'drill_down' section of the explanation report when output_format="dict" This section will include the feature values, feature names, qualitative explanation @@ -156,56 +189,126 @@ def make_drill_down_dict(provenance, shap_values, normalized_values, pipeline_fe """ drill_down = {} for parent_feature, children_features in provenance.items(): - shap_for_children = {k: v for k, v in shap_values.items() if k in children_features} - agg_for_children = {k: v for k, v in normalized_values.items() if k in children_features} + shap_for_children = { + k: v for k, v in shap_values.items() if k in children_features + } + agg_for_children = { + k: v for k, v in normalized_values.items() if k in children_features + } top_k = len(agg_for_children) - rows = _make_rows(shap_for_children, agg_for_children, pipeline_features, original_features, - top_k=top_k, include_shap_values=include_shap_values, convert_numeric_to_string=False) + rows = _make_rows( + shap_for_children, + agg_for_children, + pipeline_features, + original_features, + top_k=top_k, + include_shap_values=include_shap_values, + convert_numeric_to_string=False, + ) drill_down[parent_feature] = _rows_to_dict(rows) return drill_down @abc.abstractmethod - def make_text(self, aggregated_shap_values, aggregated_normalized_values, - shap_values, normalized_values, pipeline_features, original_features): + def make_text( + self, + aggregated_shap_values, + aggregated_normalized_values, + shap_values, + normalized_values, + pipeline_features, + original_features, + ): """Creates a table given shap values and formats it as text.""" @abc.abstractmethod - def make_dict(self, aggregated_shap_values, aggregated_normalized_values, - shap_values, normalized_values, pipeline_features, original_features): + def make_dict( + self, + aggregated_shap_values, + aggregated_normalized_values, + shap_values, + normalized_values, + pipeline_features, + original_features, + ): """Creates a table given shap values and formats it as dictionary.""" - def make_dataframe(self, aggregated_shap_values, aggregated_normalized_values, - shap_values, normalized_values, pipeline_features, original_features): - data = self.make_dict(aggregated_shap_values, aggregated_normalized_values, - shap_values=shap_values, normalized_values=normalized_values, - pipeline_features=pipeline_features, original_features=original_features)['explanations'] + def make_dataframe( + self, + aggregated_shap_values, + aggregated_normalized_values, + shap_values, + normalized_values, + pipeline_features, + original_features, + ): + data = self.make_dict( + aggregated_shap_values, + aggregated_normalized_values, + shap_values=shap_values, + 
normalized_values=normalized_values, + pipeline_features=pipeline_features, + original_features=original_features, + )["explanations"] # Not including the drill down dict for dataframes # 'drill_down' is always included in the dict output so we can delete it for d in data: - del d['drill_down'] + del d["drill_down"] df = pd.concat(map(pd.DataFrame, data)).reset_index(drop=True) - if "class_name" in df.columns and df['class_name'].isna().all(): - df = df.drop(columns=['class_name']) + if "class_name" in df.columns and df["class_name"].isna().all(): + df = df.drop(columns=["class_name"]) return df class _RegressionSHAPTable(_TableMaker): """Makes a SHAP table explaining a prediction for a regression problems.""" - def make_text(self, aggregated_shap_values, aggregated_normalized_values, - shap_values, normalized_values, pipeline_features, original_features): - return _make_text_table(aggregated_shap_values, aggregated_normalized_values, pipeline_features, original_features, - self.top_k, self.include_shap_values) - - def make_dict(self, aggregated_shap_values, aggregated_normalized_values, - shap_values, normalized_values, pipeline_features, original_features): - rows = _make_rows(aggregated_shap_values, aggregated_normalized_values, pipeline_features, original_features, - self.top_k, self.include_shap_values, convert_numeric_to_string=False) + def make_text( + self, + aggregated_shap_values, + aggregated_normalized_values, + shap_values, + normalized_values, + pipeline_features, + original_features, + ): + return _make_text_table( + aggregated_shap_values, + aggregated_normalized_values, + pipeline_features, + original_features, + self.top_k, + self.include_shap_values, + ) + + def make_dict( + self, + aggregated_shap_values, + aggregated_normalized_values, + shap_values, + normalized_values, + pipeline_features, + original_features, + ): + rows = _make_rows( + aggregated_shap_values, + aggregated_normalized_values, + pipeline_features, + original_features, + self.top_k, + self.include_shap_values, + convert_numeric_to_string=False, + ) json_rows = _rows_to_dict(rows) - drill_down = self.make_drill_down_dict(self.provenance, shap_values, normalized_values, - pipeline_features, original_features, self.include_shap_values) + drill_down = self.make_drill_down_dict( + self.provenance, + shap_values, + normalized_values, + pipeline_features, + original_features, + self.include_shap_values, + ) json_rows["class_name"] = None json_rows["drill_down"] = drill_down return {"explanations": [json_rows]} @@ -218,20 +321,53 @@ def __init__(self, top_k, include_shap_values, class_names, provenance): super().__init__(top_k, include_shap_values, provenance) self.class_names = class_names - def make_text(self, aggregated_shap_values, aggregated_normalized_values, - shap_values, normalized_values, pipeline_features, original_features): + def make_text( + self, + aggregated_shap_values, + aggregated_normalized_values, + shap_values, + normalized_values, + pipeline_features, + original_features, + ): # The SHAP algorithm will return a two-element list for binary problems. # By convention, we display the explanation for the dominant class. 
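To make the comment above concrete, here is a tiny illustration (hypothetical class names and values, not evalml code) of the binary convention: the SHAP helpers hand back one entry per class, and the table is built from element [1]:

    # Two entries, one per class; only index 1 feeds the report.
    aggregated_shap_values = [
        {"age": [-0.42], "income": [0.31]},   # class 0
        {"age": [0.42], "income": [-0.31]},   # class 1 -> shown in the table
    ]
    class_names = ["no_churn", "churn"]        # hypothetical labels
    print(class_names[1], aggregated_shap_values[1])
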
- return _make_text_table(aggregated_shap_values[1], aggregated_normalized_values[1], - pipeline_features, original_features, self.top_k, self.include_shap_values) - - def make_dict(self, aggregated_shap_values, aggregated_normalized_values, - shap_values, normalized_values, pipeline_features, original_features): - rows = _make_rows(aggregated_shap_values[1], aggregated_normalized_values[1], pipeline_features, original_features, - self.top_k, self.include_shap_values, convert_numeric_to_string=False) + return _make_text_table( + aggregated_shap_values[1], + aggregated_normalized_values[1], + pipeline_features, + original_features, + self.top_k, + self.include_shap_values, + ) + + def make_dict( + self, + aggregated_shap_values, + aggregated_normalized_values, + shap_values, + normalized_values, + pipeline_features, + original_features, + ): + rows = _make_rows( + aggregated_shap_values[1], + aggregated_normalized_values[1], + pipeline_features, + original_features, + self.top_k, + self.include_shap_values, + convert_numeric_to_string=False, + ) dict_rows = _rows_to_dict(rows) - drill_down = self.make_drill_down_dict(self.provenance, shap_values[1], normalized_values[1], - pipeline_features, original_features, self.include_shap_values) + drill_down = self.make_drill_down_dict( + self.provenance, + shap_values[1], + normalized_values[1], + pipeline_features, + original_features, + self.include_shap_values, + ) dict_rows["drill_down"] = drill_down dict_rows["class_name"] = _make_json_serializable(self.class_names[1]) return {"explanations": [dict_rows]} @@ -244,37 +380,76 @@ def __init__(self, top_k, include_shap_values, class_names, provenance): super().__init__(top_k, include_shap_values, provenance) self.class_names = class_names - def make_text(self, aggregated_shap_values, aggregated_normalized_values, - shap_values, normalized_values, pipeline_features, original_features): + def make_text( + self, + aggregated_shap_values, + aggregated_normalized_values, + shap_values, + normalized_values, + pipeline_features, + original_features, + ): strings = [] - for class_name, class_values, normalized_class_values in zip(self.class_names, aggregated_shap_values, - aggregated_normalized_values): + for class_name, class_values, normalized_class_values in zip( + self.class_names, aggregated_shap_values, aggregated_normalized_values + ): strings.append(f"Class: {class_name}\n") - table = _make_text_table(class_values, normalized_class_values, pipeline_features, original_features, - self.top_k, self.include_shap_values) + table = _make_text_table( + class_values, + normalized_class_values, + pipeline_features, + original_features, + self.top_k, + self.include_shap_values, + ) strings += table.splitlines() strings.append("\n") return "\n".join(strings) - def make_dict(self, aggregated_shap_values, aggregated_normalized_values, - shap_values, normalized_values, pipeline_features, original_features): + def make_dict( + self, + aggregated_shap_values, + aggregated_normalized_values, + shap_values, + normalized_values, + pipeline_features, + original_features, + ): json_output = [] for class_index, class_name in enumerate(self.class_names): - rows = _make_rows(aggregated_shap_values[class_index], aggregated_normalized_values[class_index], - pipeline_features, original_features, self.top_k, - self.include_shap_values, convert_numeric_to_string=False) + rows = _make_rows( + aggregated_shap_values[class_index], + aggregated_normalized_values[class_index], + pipeline_features, + original_features, + self.top_k, 
+ self.include_shap_values, + convert_numeric_to_string=False, + ) json_output_for_class = _rows_to_dict(rows) - drill_down = self.make_drill_down_dict(self.provenance, shap_values[class_index], - normalized_values[class_index], pipeline_features, original_features, - self.include_shap_values) + drill_down = self.make_drill_down_dict( + self.provenance, + shap_values[class_index], + normalized_values[class_index], + pipeline_features, + original_features, + self.include_shap_values, + ) json_output_for_class["drill_down"] = drill_down json_output_for_class["class_name"] = _make_json_serializable(class_name) json_output.append(json_output_for_class) return {"explanations": json_output} -def _make_single_prediction_shap_table(pipeline, pipeline_features, input_features, index_to_explain, top_k=3, - include_shap_values=False, output_format="text"): +def _make_single_prediction_shap_table( + pipeline, + pipeline_features, + input_features, + index_to_explain, + top_k=3, + include_shap_values=False, + output_format="text", +): """Creates table summarizing the top_k_features positive and top_k_features negative contributing features to the prediction of a single datapoint. Arguments: @@ -299,9 +474,13 @@ def _make_single_prediction_shap_table(pipeline, pipeline_features, input_featur pipeline_features_row = pipeline_features.iloc[[index_to_explain]] input_features_row = input_features.iloc[[index_to_explain]] if pipeline_features_row.isna().any(axis=None): - raise ValueError(f"Requested index ({index_to_explain}) produces NaN in features.") + raise ValueError( + f"Requested index ({index_to_explain}) produces NaN in features." + ) - shap_values = _compute_shap_values(pipeline, pipeline_features_row, training_data=pipeline_features.dropna(axis=0)) + shap_values = _compute_shap_values( + pipeline, pipeline_features_row, training_data=pipeline_features.dropna(axis=0) + ) normalized_values = _normalize_shap_values(shap_values) provenance = pipeline._get_feature_provenance() @@ -312,19 +491,42 @@ def _make_single_prediction_shap_table(pipeline, pipeline_features, input_featur if hasattr(pipeline, "classes_"): class_names = pipeline.classes_ - table_makers = {ProblemTypes.REGRESSION: _RegressionSHAPTable(top_k, include_shap_values, provenance), - ProblemTypes.BINARY: _BinarySHAPTable(top_k, include_shap_values, class_names, provenance), - ProblemTypes.MULTICLASS: _MultiClassSHAPTable(top_k, include_shap_values, class_names, provenance), - ProblemTypes.TIME_SERIES_REGRESSION: _RegressionSHAPTable(top_k, include_shap_values, provenance), - ProblemTypes.TIME_SERIES_BINARY: _BinarySHAPTable(top_k, include_shap_values, class_names, provenance), - ProblemTypes.TIME_SERIES_MULTICLASS: _MultiClassSHAPTable(top_k, include_shap_values, class_names, provenance)} + table_makers = { + ProblemTypes.REGRESSION: _RegressionSHAPTable( + top_k, include_shap_values, provenance + ), + ProblemTypes.BINARY: _BinarySHAPTable( + top_k, include_shap_values, class_names, provenance + ), + ProblemTypes.MULTICLASS: _MultiClassSHAPTable( + top_k, include_shap_values, class_names, provenance + ), + ProblemTypes.TIME_SERIES_REGRESSION: _RegressionSHAPTable( + top_k, include_shap_values, provenance + ), + ProblemTypes.TIME_SERIES_BINARY: _BinarySHAPTable( + top_k, include_shap_values, class_names, provenance + ), + ProblemTypes.TIME_SERIES_MULTICLASS: _MultiClassSHAPTable( + top_k, include_shap_values, class_names, provenance + ), + } table_maker_class = table_makers[pipeline.problem_type] - table_maker = {"text": 
table_maker_class.make_text, "dict": table_maker_class.make_dict, - "dataframe": table_maker_class.make_dataframe}[output_format] - - return table_maker(aggregated_shap_values, aggregated_normalized_shap_values, - shap_values, normalized_values, pipeline_features_row, input_features_row) + table_maker = { + "text": table_maker_class.make_text, + "dict": table_maker_class.make_dict, + "dataframe": table_maker_class.make_dataframe, + }[output_format] + + return table_maker( + aggregated_shap_values, + aggregated_normalized_shap_values, + shap_values, + normalized_values, + pipeline_features_row, + input_features_row, + ) class _SectionMaker(abc.ABC): @@ -349,7 +551,6 @@ def make_dataframe(self, *args, **kwargs): class _Heading(_SectionMaker): - def __init__(self, prefixes, n_indices): self.prefixes = prefixes self.n_indices = n_indices @@ -407,27 +608,35 @@ def make_text(self, index, y_pred, y_true, scores, dataframe_index): dataframe_index (pd.Series): pandas index for the entire dataset. Used to display the index in the data each explanation belongs to. """ - pred_value = [f"{col_name}: {pred}" for col_name, pred in - zip(y_pred.columns, round(y_pred.iloc[index], 3).tolist())] + pred_value = [ + f"{col_name}: {pred}" + for col_name, pred in zip( + y_pred.columns, round(y_pred.iloc[index], 3).tolist() + ) + ] pred_value = "[" + ", ".join(pred_value) + "]" true_value = y_true.iloc[index] - return [f"\t\tPredicted Probabilities: {pred_value}\n", - f"\t\tPredicted Value: {self.predicted_values[index]}\n", - f"\t\tTarget Value: {true_value}\n", - f"\t\t{self.error_name}: {round(scores[index], 3)}\n", - f"\t\tIndex ID: {dataframe_index.iloc[index]}\n\n"] + return [ + f"\t\tPredicted Probabilities: {pred_value}\n", + f"\t\tPredicted Value: {self.predicted_values[index]}\n", + f"\t\tTarget Value: {true_value}\n", + f"\t\t{self.error_name}: {round(scores[index], 3)}\n", + f"\t\tIndex ID: {dataframe_index.iloc[index]}\n\n", + ] def make_dict(self, index, y_pred, y_true, scores, dataframe_index): """Makes the predicted values section for classification problem best/worst reports formatted as dictionary.""" pred_values = dict(zip(y_pred.columns, round(y_pred.iloc[index], 3).tolist())) - return {"probabilities": pred_values, - "predicted_value": _make_json_serializable(self.predicted_values[index]), - "target_value": _make_json_serializable(y_true.iloc[index]), - "error_name": self.error_name, - "error_value": _make_json_serializable(scores[index]), - "index_id": _make_json_serializable(dataframe_index.iloc[index])} + return { + "probabilities": pred_values, + "predicted_value": _make_json_serializable(self.predicted_values[index]), + "target_value": _make_json_serializable(y_true.iloc[index]), + "error_name": self.error_name, + "error_value": _make_json_serializable(scores[index]), + "index_id": _make_json_serializable(dataframe_index.iloc[index]), + } def make_dataframe(self, index, y_pred, y_true, scores, dataframe_index): """Makes the predicted values section for classification problem best/worst reports formatted as dataframe.""" @@ -452,17 +661,23 @@ def make_text(self, index, y_pred, y_true, scores, dataframe_index): dataframe_index (pd.Series): pandas index for the entire dataset. Used to display the index in the data each explanation belongs to. 
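Stepping back from the diff for a moment: the classification make_dict above assembles one "predicted values" record per explained row. A hedged sketch of what that record looks like, using invented probabilities and labels rather than real pipeline output:

    import pandas as pd

    y_pred = pd.DataFrame({"benign": [0.823, 0.104], "malignant": [0.177, 0.896]})
    y_true = pd.Series(["benign", "malignant"])
    index = 1

    section = {
        "probabilities": dict(zip(y_pred.columns, round(y_pred.iloc[index], 3).tolist())),
        "predicted_value": "malignant",   # whatever the pipeline predicted for this row
        "target_value": y_true.iloc[index],
        "error_name": "Cross Entropy",
        "error_value": 0.110,             # metric value for this row (illustrative)
        "index_id": index,
    }
    print(section["probabilities"])  # {'benign': 0.104, 'malignant': 0.896}
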
""" - return [f"\t\tPredicted Value: {round(y_pred.iloc[index], 3)}\n", - f"\t\tTarget Value: {round(y_true.iloc[index], 3)}\n", - f"\t\t{self.error_name}: {round(scores[index], 3)}\n", - f"\t\tIndex ID: {dataframe_index.iloc[index]}\n\n"] + return [ + f"\t\tPredicted Value: {round(y_pred.iloc[index], 3)}\n", + f"\t\tTarget Value: {round(y_true.iloc[index], 3)}\n", + f"\t\t{self.error_name}: {round(scores[index], 3)}\n", + f"\t\tIndex ID: {dataframe_index.iloc[index]}\n\n", + ] def make_dict(self, index, y_pred, y_true, scores, dataframe_index): """Makes the predicted values section for regression problem best/worst reports formatted as a dictionary.""" - return {"probabilities": None, "predicted_value": round(y_pred.iloc[index], 3), - "target_value": round(y_true.iloc[index], 3), "error_name": self.error_name, - "error_value": round(scores[index], 3), - "index_id": _make_json_serializable(dataframe_index.iloc[index])} + return { + "probabilities": None, + "predicted_value": round(y_pred.iloc[index], 3), + "target_value": round(y_true.iloc[index], 3), + "error_name": self.error_name, + "error_value": round(scores[index], 3), + "index_id": _make_json_serializable(dataframe_index.iloc[index]), + } def make_dataframe(self, index, y_pred, y_true, scores, dataframe_index): """Makes the predicted values section formatted as a dataframe.""" @@ -491,32 +706,43 @@ def make_text(self, index, pipeline, pipeline_features, input_features): pipeline_features (pd.DataFrame): The dataframe of features created by the pipeline. input_features (pd.Dataframe): The dataframe of features passed to the pipeline. """ - table = _make_single_prediction_shap_table(pipeline, pipeline_features, - input_features, - index_to_explain=index, - top_k=self.top_k_features, - include_shap_values=self.include_shap_values, output_format="text") + table = _make_single_prediction_shap_table( + pipeline, + pipeline_features, + input_features, + index_to_explain=index, + top_k=self.top_k_features, + include_shap_values=self.include_shap_values, + output_format="text", + ) table = table.splitlines() # Indent the rows of the table to match the indentation of the entire report. 
return ["\t\t" + line + "\n" for line in table] + ["\n\n"] def make_dict(self, index, pipeline, pipeline_features, input_features): """Makes the SHAP table section formatted as a dictionary.""" - json_output = _make_single_prediction_shap_table(pipeline, pipeline_features, input_features, - index_to_explain=index, - top_k=self.top_k_features, - include_shap_values=self.include_shap_values, - output_format="dict") + json_output = _make_single_prediction_shap_table( + pipeline, + pipeline_features, + input_features, + index_to_explain=index, + top_k=self.top_k_features, + include_shap_values=self.include_shap_values, + output_format="dict", + ) return json_output def make_dataframe(self, index, pipeline, pipeline_features, input_features): """Makes the SHAP table section formatted as a dataframe.""" - return _make_single_prediction_shap_table(pipeline, pipeline_features, - input_features, - index_to_explain=index, - top_k=self.top_k_features, - include_shap_values=self.include_shap_values, - output_format="dataframe") + return _make_single_prediction_shap_table( + pipeline, + pipeline_features, + input_features, + index_to_explain=index, + top_k=self.top_k_features, + include_shap_values=self.include_shap_values, + output_format="dataframe", + ) class _ReportMaker: @@ -553,13 +779,22 @@ def make_text(self, data): for rank, index in enumerate(data.index_list): report.extend(self.heading_maker.make_text(rank)) if self.make_predicted_values_maker: - report.extend(self.make_predicted_values_maker.make_text(index, data.y_pred, data.y_true, - data.errors, - pd.Series(data.pipeline_features.index))) + report.extend( + self.make_predicted_values_maker.make_text( + index, + data.y_pred, + data.y_true, + data.errors, + pd.Series(data.pipeline_features.index), + ) + ) else: report.extend([""]) - report.extend(self.table_maker.make_text(index, data.pipeline, data.pipeline_features, - data.input_features)) + report.extend( + self.table_maker.make_text( + index, data.pipeline, data.pipeline_features, data.input_features + ) + ) return "".join(report) def make_dict(self, data): @@ -578,35 +813,47 @@ def make_dict(self, data): if self.heading_maker: section["rank"] = self.heading_maker.make_dict(rank) if self.make_predicted_values_maker: - section["predicted_values"] = self.make_predicted_values_maker.make_dict(index, data.y_pred, - data.y_true, data.errors, - pd.Series(data.pipeline_features.index)) - section["explanations"] = self.table_maker.make_dict(index, data.pipeline, - data.pipeline_features, - data.input_features)["explanations"] + section[ + "predicted_values" + ] = self.make_predicted_values_maker.make_dict( + index, + data.y_pred, + data.y_true, + data.errors, + pd.Series(data.pipeline_features.index), + ) + section["explanations"] = self.table_maker.make_dict( + index, data.pipeline, data.pipeline_features, data.input_features + )["explanations"] report.append(section) return {"explanations": report} def make_dataframe(self, data): report = [] for rank, index in enumerate(data.index_list): - shap_table = self.table_maker.make_dataframe(index, data.pipeline, data.pipeline_features, - data.input_features) + shap_table = self.table_maker.make_dataframe( + index, data.pipeline, data.pipeline_features, data.input_features + ) if self.make_predicted_values_maker: - heading = self.make_predicted_values_maker.make_dataframe(index, data.y_pred, data.y_true, data.errors, - pd.Series(data.pipeline_features.index)) + heading = self.make_predicted_values_maker.make_dataframe( + index, + data.y_pred, + 
data.y_true, + data.errors, + pd.Series(data.pipeline_features.index), + ) for key, value in heading.items(): if key == "probabilities": for class_name, probability in value.items(): - shap_table[f'label_{class_name}_probability'] = probability + shap_table[f"label_{class_name}_probability"] = probability else: shap_table[key] = value if self.heading_maker: heading = self.heading_maker.make_dataframe(rank) - shap_table['rank'] = heading['index'] - shap_table['prefix'] = heading['prefix'] + shap_table["rank"] = heading["index"] + shap_table["prefix"] = heading["prefix"] else: - shap_table['prediction_number'] = rank + shap_table["prediction_number"] = rank report.append(shap_table) df = pd.concat(report).reset_index(drop=True) diff --git a/evalml/model_understanding/prediction_explanations/explainers.py b/evalml/model_understanding/prediction_explanations/explainers.py index ffbd4df8bc..c1bb8ed0b1 100644 --- a/evalml/model_understanding/prediction_explanations/explainers.py +++ b/evalml/model_understanding/prediction_explanations/explainers.py @@ -10,19 +10,38 @@ from evalml.exceptions import PipelineScoreError from evalml.model_family import ModelFamily from evalml.model_understanding.prediction_explanations._report_creator_factory import ( - _report_creator_factory + _report_creator_factory, ) from evalml.problem_types import ProblemTypes, is_regression, is_time_series from evalml.utils import infer_feature_types from evalml.utils.gen_utils import drop_rows_with_nans # Container for all of the pipeline-related data we need to create reports. Helps standardize APIs of report makers. -_ReportData = namedtuple("ReportData", ["pipeline", "pipeline_features", "input_features", - "y_true", "y_pred", "y_pred_values", "errors", "index_list", "metric"]) +_ReportData = namedtuple( + "ReportData", + [ + "pipeline", + "pipeline_features", + "input_features", + "y_true", + "y_pred", + "y_pred_values", + "errors", + "index_list", + "metric", + ], +) -def explain_predictions(pipeline, input_features, y, indices_to_explain, top_k_features=3, include_shap_values=False, - output_format="text"): +def explain_predictions( + pipeline, + input_features, + y, + indices_to_explain, + top_k_features=3, + include_shap_values=False, + output_format="text", +): """Creates a report summarizing the top contributing features for each data point in the input features. XGBoost and Stacked Ensemble models, as well as CatBoost multiclass classifiers, are not currently supported. @@ -53,18 +72,35 @@ def explain_predictions(pipeline, input_features, y, indices_to_explain, top_k_f if input_features.empty: raise ValueError("Parameter input_features must be a non-empty dataframe.") if output_format not in {"text", "dict", "dataframe"}: - raise ValueError(f"Parameter output_format must be either text, dict, or dataframe. Received {output_format}") + raise ValueError( + f"Parameter output_format must be either text, dict, or dataframe. 
Received {output_format}" + ) if any([x < 0 or x >= len(input_features) for x in indices_to_explain]): - raise ValueError(f"Explained indices should be between 0 and {len(input_features) - 1}") + raise ValueError( + f"Explained indices should be between 0 and {len(input_features) - 1}" + ) pipeline_features = pipeline.compute_estimator_features(input_features, y) - data = _ReportData(pipeline, pipeline_features, input_features, y_true=y, y_pred=None, - y_pred_values=None, errors=None, index_list=indices_to_explain, metric=None) - - report_creator = _report_creator_factory(data, report_type="explain_predictions", - output_format=output_format, top_k_features=top_k_features, - include_shap_values=include_shap_values) + data = _ReportData( + pipeline, + pipeline_features, + input_features, + y_true=y, + y_pred=None, + y_pred_values=None, + errors=None, + index_list=indices_to_explain, + metric=None, + ) + + report_creator = _report_creator_factory( + data, + report_type="explain_predictions", + output_format=output_format, + top_k_features=top_k_features, + include_shap_values=include_shap_values, + ) return report_creator(data) @@ -81,15 +117,24 @@ def _update_progress(start_time, current_time, progress_stage, callback_function class ExplainPredictionsStage(Enum): - PREPROCESSING_STAGE = "preprocessing_stage", - PREDICT_STAGE = "predict_stage", - COMPUTE_FEATURE_STAGE = "compute_feature_stage", - COMPUTE_SHAP_VALUES_STAGE = "compute_shap_value_stage", + PREPROCESSING_STAGE = ("preprocessing_stage",) + PREDICT_STAGE = ("predict_stage",) + COMPUTE_FEATURE_STAGE = ("compute_feature_stage",) + COMPUTE_SHAP_VALUES_STAGE = ("compute_shap_value_stage",) DONE = "done" -def explain_predictions_best_worst(pipeline, input_features, y_true, num_to_explain=5, top_k_features=3, - include_shap_values=False, metric=None, output_format="text", callback=None): +def explain_predictions_best_worst( + pipeline, + input_features, + y_true, + num_to_explain=5, + top_k_features=3, + include_shap_values=False, + metric=None, + output_format="text", + callback=None, +): """Creates a report summarizing the top contributing features for the best and worst points in the dataset as measured by error to true labels. XGBoost and Stacked Ensemble models, as well as CatBoost multiclass classifiers, are not currently supported. @@ -121,25 +166,35 @@ def explain_predictions_best_worst(pipeline, input_features, y_true, num_to_expl ValueError: if an output_format outside of "text", "dict" or "dataframe is provided. """ start_time = timer() - _update_progress(start_time, timer(), ExplainPredictionsStage.PREPROCESSING_STAGE, callback) + _update_progress( + start_time, timer(), ExplainPredictionsStage.PREPROCESSING_STAGE, callback + ) input_features = infer_feature_types(input_features) y_true = infer_feature_types(y_true) if not (input_features.shape[0] >= num_to_explain * 2): - raise ValueError(f"Input features must be a dataframe with more than {num_to_explain * 2} rows! " - "Convert to a dataframe and select a smaller value for num_to_explain if you do not have " - "enough data.") + raise ValueError( + f"Input features must be a dataframe with more than {num_to_explain * 2} rows! " + "Convert to a dataframe and select a smaller value for num_to_explain if you do not have " + "enough data." + ) if y_true.shape[0] != input_features.shape[0]: - raise ValueError("Parameters y_true and input_features must have the same number of data points. 
Received: " - f"true labels: {y_true.shape[0]} and {input_features.shape[0]}") + raise ValueError( + "Parameters y_true and input_features must have the same number of data points. Received: " + f"true labels: {y_true.shape[0]} and {input_features.shape[0]}" + ) if output_format not in {"text", "dict", "dataframe"}: - raise ValueError(f"Parameter output_format must be either text, dict, or dataframe. Received {output_format}") + raise ValueError( + f"Parameter output_format must be either text, dict, or dataframe. Received {output_format}" + ) if pipeline.model_family == ModelFamily.ENSEMBLE: raise ValueError("Cannot explain predictions for a stacked ensemble pipeline") if not metric: metric = DEFAULT_METRICS[pipeline.problem_type] - _update_progress(start_time, timer(), ExplainPredictionsStage.PREDICT_STAGE, callback) + _update_progress( + start_time, timer(), ExplainPredictionsStage.PREDICT_STAGE, callback + ) try: if is_regression(pipeline.problem_type): @@ -157,28 +212,51 @@ def explain_predictions_best_worst(pipeline, input_features, y_true, num_to_expl else: y_pred = pipeline.predict_proba(input_features) y_pred_values = pipeline.predict(input_features) - y_true_no_nan, y_pred_no_nan, y_pred_values_no_nan = drop_rows_with_nans(y_true, y_pred, y_pred_values) + y_true_no_nan, y_pred_no_nan, y_pred_values_no_nan = drop_rows_with_nans( + y_true, y_pred, y_pred_values + ) errors = metric(pipeline._encode_targets(y_true_no_nan), y_pred_no_nan) except Exception as e: tb = traceback.format_tb(sys.exc_info()[2]) - raise PipelineScoreError(exceptions={metric.__name__: (e, tb)}, scored_successfully={}) + raise PipelineScoreError( + exceptions={metric.__name__: (e, tb)}, scored_successfully={} + ) errors = pd.Series(errors, index=y_pred_no_nan.index) sorted_scores = errors.sort_values() best_indices = sorted_scores.index[:num_to_explain] worst_indices = sorted_scores.index[-num_to_explain:] index_list = best_indices.tolist() + worst_indices.tolist() - _update_progress(start_time, timer(), ExplainPredictionsStage.COMPUTE_FEATURE_STAGE, callback) + _update_progress( + start_time, timer(), ExplainPredictionsStage.COMPUTE_FEATURE_STAGE, callback + ) pipeline_features = pipeline.compute_estimator_features(input_features, y_true) - _update_progress(start_time, timer(), ExplainPredictionsStage.COMPUTE_SHAP_VALUES_STAGE, callback) - - data = _ReportData(pipeline, pipeline_features, input_features, y_true, y_pred, y_pred_values, errors, index_list, metric) - - report_creator = _report_creator_factory(data, report_type="explain_predictions_best_worst", - output_format=output_format, top_k_features=top_k_features, - include_shap_values=include_shap_values, num_to_explain=num_to_explain) + _update_progress( + start_time, timer(), ExplainPredictionsStage.COMPUTE_SHAP_VALUES_STAGE, callback + ) + + data = _ReportData( + pipeline, + pipeline_features, + input_features, + y_true, + y_pred, + y_pred_values, + errors, + index_list, + metric, + ) + + report_creator = _report_creator_factory( + data, + report_type="explain_predictions_best_worst", + output_format=output_format, + top_k_features=top_k_features, + include_shap_values=include_shap_values, + num_to_explain=num_to_explain, + ) _update_progress(start_time, timer(), ExplainPredictionsStage.DONE, callback) @@ -209,13 +287,17 @@ def cross_entropy(y_true, y_pred_proba): np.ndarray """ n_data_points = y_pred_proba.shape[0] - log_likelihood = -np.log(y_pred_proba.values[range(n_data_points), y_true.values.astype("int")]) + log_likelihood = -np.log( + 
y_pred_proba.values[range(n_data_points), y_true.values.astype("int")] + ) return log_likelihood -DEFAULT_METRICS = {ProblemTypes.BINARY: cross_entropy, - ProblemTypes.MULTICLASS: cross_entropy, - ProblemTypes.REGRESSION: abs_error, - ProblemTypes.TIME_SERIES_BINARY: cross_entropy, - ProblemTypes.TIME_SERIES_MULTICLASS: cross_entropy, - ProblemTypes.TIME_SERIES_REGRESSION: abs_error} +DEFAULT_METRICS = { + ProblemTypes.BINARY: cross_entropy, + ProblemTypes.MULTICLASS: cross_entropy, + ProblemTypes.REGRESSION: abs_error, + ProblemTypes.TIME_SERIES_BINARY: cross_entropy, + ProblemTypes.TIME_SERIES_MULTICLASS: cross_entropy, + ProblemTypes.TIME_SERIES_REGRESSION: abs_error, +} diff --git a/evalml/objectives/__init__.py b/evalml/objectives/__init__.py index 9c161832ee..f805539497 100644 --- a/evalml/objectives/__init__.py +++ b/evalml/objectives/__init__.py @@ -3,9 +3,7 @@ from .fraud_cost import FraudCost from .lead_scoring import LeadScoring from .sensitivity_low_alert import SensitivityLowAlert -from .multiclass_classification_objective import ( - MulticlassClassificationObjective -) +from .multiclass_classification_objective import MulticlassClassificationObjective from .objective_base import ObjectiveBase from .regression_objective import RegressionObjective from .standard_metrics import ( @@ -42,7 +40,12 @@ Recall, RecallMacro, RecallMicro, - RecallWeighted + RecallWeighted, +) +from .utils import ( + get_objective, + get_core_objectives, + get_all_objective_names, + get_non_core_objectives, + get_core_objective_names, ) -from .utils import get_objective, get_core_objectives, get_all_objective_names, get_non_core_objectives,\ - get_core_objective_names diff --git a/evalml/objectives/binary_classification_objective.py b/evalml/objectives/binary_classification_objective.py index 59214201d3..034e62ca93 100644 --- a/evalml/objectives/binary_classification_objective.py +++ b/evalml/objectives/binary_classification_objective.py @@ -14,7 +14,7 @@ class BinaryClassificationObjective(ObjectiveBase): @property def can_optimize_threshold(cls): """Returns a boolean determining if we can optimize the binary classification objective threshold. - This will be false for any objective that works directly with predicted probabilities, like log loss and AUC. Otherwise, it will be true.""" + This will be false for any objective that works directly with predicted probabilities, like log loss and AUC. 
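The per-row cross entropy defined above (and wired into DEFAULT_METRICS for the classification problem types) is easy to sanity-check by hand. This sketch uses invented probabilities and is not part of the patch:

    import numpy as np
    import pandas as pd

    y_true = pd.Series([0, 1, 1])
    y_pred_proba = pd.DataFrame({0: [0.9, 0.2, 0.4], 1: [0.1, 0.8, 0.6]})

    n = y_pred_proba.shape[0]
    # Pick each row's probability of the true class, then take -log of it
    per_row = -np.log(y_pred_proba.values[range(n), y_true.values.astype("int")])
    print(per_row.round(3))  # [0.105 0.223 0.511]
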
Otherwise, it will be true.""" return not cls.score_needs_proba def optimize_threshold(self, ypred_proba, y_true, X=None): @@ -37,11 +37,13 @@ def optimize_threshold(self, ypred_proba, y_true, X=None): raise RuntimeError("Trying to optimize objective that can't be optimized!") def cost(threshold): - y_predicted = self.decision_function(ypred_proba=ypred_proba, threshold=threshold, X=X) + y_predicted = self.decision_function( + ypred_proba=ypred_proba, threshold=threshold, X=X + ) cost = self.objective_function(y_true, y_predicted, X=X) return -cost if self.greater_is_better else cost - optimal = minimize_scalar(cost, method='Golden', options={"maxiter": 100}) + optimal = minimize_scalar(cost, method="Golden", options={"maxiter": 100}) return optimal.x def decision_function(self, ypred_proba, threshold=0.5, X=None): diff --git a/evalml/objectives/cost_benefit_matrix.py b/evalml/objectives/cost_benefit_matrix.py index 8bea92a249..08b6f0af96 100644 --- a/evalml/objectives/cost_benefit_matrix.py +++ b/evalml/objectives/cost_benefit_matrix.py @@ -1,4 +1,3 @@ - import numpy as np from .binary_classification_objective import BinaryClassificationObjective @@ -8,8 +7,9 @@ class CostBenefitMatrix(BinaryClassificationObjective): """Score using a cost-benefit matrix. Scores quantify the benefits of a given value, so greater numeric - scores represents a better score. Costs and scores can be negative, indicating that a value is not beneficial. - For example, in the case of monetary profit, a negative cost and/or score represents loss of cash flow.""" + scores represents a better score. Costs and scores can be negative, indicating that a value is not beneficial. + For example, in the case of monetary profit, a negative cost and/or score represents loss of cash flow.""" + name = "Cost Benefit Matrix" greater_is_better = True score_needs_proba = False @@ -26,7 +26,9 @@ def __init__(self, true_positive, true_negative, false_positive, false_negative) false_negative (float): Cost associated with false negative predictions """ if None in {true_positive, true_negative, false_positive, false_negative}: - raise ValueError("Parameters to CostBenefitMatrix must all be numeric values.") + raise ValueError( + "Parameters to CostBenefitMatrix must all be numeric values." 
+ ) self.true_positive = true_positive self.true_negative = true_negative @@ -44,9 +46,13 @@ def objective_function(self, y_true, y_predicted, X=None): Returns: float: Cost-benefit matrix score """ - conf_matrix = confusion_matrix(y_true, y_predicted, normalize_method='all') - cost_matrix = np.array([[self.true_negative, self.false_positive], - [self.false_negative, self.true_positive]]) + conf_matrix = confusion_matrix(y_true, y_predicted, normalize_method="all") + cost_matrix = np.array( + [ + [self.true_negative, self.false_positive], + [self.false_negative, self.true_positive], + ] + ) total_cost = np.multiply(conf_matrix.values, cost_matrix).sum() return total_cost diff --git a/evalml/objectives/fraud_cost.py b/evalml/objectives/fraud_cost.py index 59f70b0132..e81b453aaa 100644 --- a/evalml/objectives/fraud_cost.py +++ b/evalml/objectives/fraud_cost.py @@ -1,17 +1,22 @@ - from .binary_classification_objective import BinaryClassificationObjective class FraudCost(BinaryClassificationObjective): """Score the percentage of money lost of the total transaction amount process due to fraud.""" + name = "Fraud Cost" greater_is_better = False score_needs_proba = False perfect_score = 0.0 is_bounded_like_percentage = True - def __init__(self, retry_percentage=.5, interchange_fee=.02, - fraud_payout_percentage=1.0, amount_col='amount'): + def __init__( + self, + retry_percentage=0.5, + interchange_fee=0.02, + fraud_payout_percentage=1.0, + amount_col="amount", + ): """Create instance of FraudCost Arguments: @@ -45,7 +50,7 @@ def decision_function(self, ypred_proba, threshold=0.0, X=None): if X is not None: X = self._standardize_input_type(X) ypred_proba = self._standardize_input_type(ypred_proba) - transformed_probs = (ypred_proba.values * X[self.amount_col]) + transformed_probs = ypred_proba.values * X[self.amount_col] return transformed_probs > threshold def objective_function(self, y_true, y_predicted, X): @@ -74,7 +79,9 @@ def objective_function(self, y_true, y_predicted, X): fraud_cost = transaction_amount * self.fraud_payout_percentage # money made from interchange fees on transaction - interchange_cost = transaction_amount * (1 - self.retry_percentage) * self.interchange_fee + interchange_cost = ( + transaction_amount * (1 - self.retry_percentage) * self.interchange_fee + ) # calculate cost of missing fraudulent transactions false_negatives = (y_true & ~y_predicted) * fraud_cost diff --git a/evalml/objectives/lead_scoring.py b/evalml/objectives/lead_scoring.py index d082939b56..560a2d4677 100644 --- a/evalml/objectives/lead_scoring.py +++ b/evalml/objectives/lead_scoring.py @@ -5,6 +5,7 @@ class LeadScoring(BinaryClassificationObjective): """Lead scoring.""" + name = "Lead Scoring" greater_is_better = True score_needs_proba = False @@ -24,13 +25,13 @@ def __init__(self, true_positives=1, false_positives=-1): def objective_function(self, y_true, y_predicted, X=None): """Calculate the profit per lead. - Arguments: - y_predicted (pd.Series): Predicted labels - y_true (pd.Series): True labels - X (pd.DataFrame): Ignored. + Arguments: + y_predicted (pd.Series): Predicted labels + y_true (pd.Series): True labels + X (pd.DataFrame): Ignored. 
- Returns: - float: Profit per lead + Returns: + float: Profit per lead """ y_true = self._standardize_input_type(y_true) y_predicted = self._standardize_input_type(y_predicted) diff --git a/evalml/objectives/objective_base.py b/evalml/objectives/objective_base.py index 4fac837e7d..c95d6fb226 100644 --- a/evalml/objectives/objective_base.py +++ b/evalml/objectives/objective_base.py @@ -28,8 +28,7 @@ def greater_is_better(cls): @classmethod @abstractmethod def score_needs_proba(cls): - """Returns a boolean determining if the score() method needs probability estimates. This should be true for objectives which work with predicted probabilities, like log loss or AUC, and false for objectives which compare predicted class labels to the actual labels, like F1 or correlation. - """ + """Returns a boolean determining if the score() method needs probability estimates. This should be true for objectives which work with predicted probabilities, like log loss or AUC, and false for objectives which compare predicted class labels to the actual labels, like F1 or correlation.""" @property @classmethod @@ -111,7 +110,11 @@ def validate_inputs(self, y_true, y_predicted): None """ if y_predicted.shape[0] != y_true.shape[0]: - raise ValueError("Inputs have mismatched dimensions: y_predicted has shape {}, y_true has shape {}".format(len(y_predicted), len(y_true))) + raise ValueError( + "Inputs have mismatched dimensions: y_predicted has shape {}, y_true has shape {}".format( + len(y_predicted), len(y_true) + ) + ) if len(y_true) == 0: raise ValueError("Length of inputs is 0") if np.isnan(y_true).any() or np.isinf(y_true).any(): @@ -121,7 +124,9 @@ def validate_inputs(self, y_true, y_predicted): if np.isnan(y_pred_flat).any() or np.isinf(y_pred_flat).any(): raise ValueError("y_predicted contains NaN or infinity") if self.score_needs_proba and np.any([(y_pred_flat < 0) | (y_pred_flat > 1)]): - raise ValueError("y_predicted contains probability estimates not within [0, 1]") + raise ValueError( + "y_predicted contains probability estimates not within [0, 1]" + ) @classmethod def calculate_percent_difference(cls, score, baseline_score): @@ -145,15 +150,24 @@ def calculate_percent_difference(cls, score, baseline_score): return 0 # Return inf when dividing by 0 - if np.isclose(baseline_score, 0, atol=1e-10) and not cls.is_bounded_like_percentage: + if ( + np.isclose(baseline_score, 0, atol=1e-10) + and not cls.is_bounded_like_percentage + ): return np.inf decrease = False - if (baseline_score > score and cls.greater_is_better) or (baseline_score < score and not cls.greater_is_better): + if (baseline_score > score and cls.greater_is_better) or ( + baseline_score < score and not cls.greater_is_better + ): decrease = True - difference = (baseline_score - score) - change = difference if cls.is_bounded_like_percentage else difference / baseline_score + difference = baseline_score - score + change = ( + difference + if cls.is_bounded_like_percentage + else difference / baseline_score + ) return 100 * (-1) ** (decrease) * np.abs(change) @classmethod diff --git a/evalml/objectives/sensitivity_low_alert.py b/evalml/objectives/sensitivity_low_alert.py index 376256061e..0b42b08faa 100644 --- a/evalml/objectives/sensitivity_low_alert.py +++ b/evalml/objectives/sensitivity_low_alert.py @@ -35,7 +35,9 @@ def decision_function(self, ypred_proba, **kwargs): ypred_proba = self._standardize_input_type(ypred_proba) if len(ypred_proba.unique()) == 1: - logger.debug(f"All predicted probabilities have the same value: 
{ypred_proba.unique()}") + logger.debug( + f"All predicted probabilities have the same value: {ypred_proba.unique()}" + ) prob_thresh = np.quantile(ypred_proba, 1 - self.alert_rate) if (prob_thresh == 0) or (prob_thresh == 1): diff --git a/evalml/objectives/standard_metrics.py b/evalml/objectives/standard_metrics.py index c9834f70e0..c15808be3a 100644 --- a/evalml/objectives/standard_metrics.py +++ b/evalml/objectives/standard_metrics.py @@ -7,15 +7,14 @@ from ..utils import classproperty from .binary_classification_objective import BinaryClassificationObjective -from .multiclass_classification_objective import ( - MulticlassClassificationObjective -) +from .multiclass_classification_objective import MulticlassClassificationObjective from .regression_objective import RegressionObjective from .time_series_regression_objective import TimeSeriesRegressionObjective class AccuracyBinary(BinaryClassificationObjective): """Accuracy score for binary classification.""" + name = "Accuracy Binary" greater_is_better = True score_needs_proba = False @@ -28,6 +27,7 @@ def objective_function(self, y_true, y_predicted, X=None): class AccuracyMulticlass(MulticlassClassificationObjective): """Accuracy score for multiclass classification.""" + name = "Accuracy Multiclass" greater_is_better = True score_needs_proba = False @@ -40,6 +40,7 @@ def objective_function(self, y_true, y_predicted, X=None): class BalancedAccuracyBinary(BinaryClassificationObjective): """Balanced accuracy score for binary classification.""" + name = "Balanced Accuracy Binary" greater_is_better = True score_needs_proba = False @@ -52,6 +53,7 @@ def objective_function(self, y_true, y_predicted, X=None): class BalancedAccuracyMulticlass(MulticlassClassificationObjective): """Balanced accuracy score for multiclass classification.""" + name = "Balanced Accuracy Multiclass" greater_is_better = True score_needs_proba = False @@ -64,6 +66,7 @@ def objective_function(self, y_true, y_predicted, X=None): class F1(BinaryClassificationObjective): """F1 score for binary classification.""" + name = "F1" greater_is_better = True score_needs_proba = False @@ -76,6 +79,7 @@ def objective_function(self, y_true, y_predicted, X=None): class F1Micro(MulticlassClassificationObjective): """F1 score for multiclass classification using micro averaging.""" + name = "F1 Micro" greater_is_better = True score_needs_proba = False @@ -83,11 +87,12 @@ class F1Micro(MulticlassClassificationObjective): is_bounded_like_percentage = True def objective_function(self, y_true, y_predicted, X=None): - return metrics.f1_score(y_true, y_predicted, average='micro', zero_division=0.0) + return metrics.f1_score(y_true, y_predicted, average="micro", zero_division=0.0) class F1Macro(MulticlassClassificationObjective): """F1 score for multiclass classification using macro averaging.""" + name = "F1 Macro" greater_is_better = True score_needs_proba = False @@ -95,11 +100,12 @@ class F1Macro(MulticlassClassificationObjective): is_bounded_like_percentage = True def objective_function(self, y_true, y_predicted, X=None): - return metrics.f1_score(y_true, y_predicted, average='macro', zero_division=0.0) + return metrics.f1_score(y_true, y_predicted, average="macro", zero_division=0.0) class F1Weighted(MulticlassClassificationObjective): """F1 score for multiclass classification using weighted averaging.""" + name = "F1 Weighted" greater_is_better = True score_needs_proba = False @@ -107,11 +113,14 @@ class F1Weighted(MulticlassClassificationObjective): is_bounded_like_percentage = True def 
objective_function(self, y_true, y_predicted, X=None): - return metrics.f1_score(y_true, y_predicted, average='weighted', zero_division=0.0) + return metrics.f1_score( + y_true, y_predicted, average="weighted", zero_division=0.0 + ) class Precision(BinaryClassificationObjective): """Precision score for binary classification.""" + name = "Precision" greater_is_better = True score_needs_proba = False @@ -124,6 +133,7 @@ def objective_function(self, y_true, y_predicted, X=None): class PrecisionMicro(MulticlassClassificationObjective): """Precision score for multiclass classification using micro averaging.""" + name = "Precision Micro" greater_is_better = True score_needs_proba = False @@ -131,11 +141,14 @@ class PrecisionMicro(MulticlassClassificationObjective): is_bounded_like_percentage = True def objective_function(self, y_true, y_predicted, X=None): - return metrics.precision_score(y_true, y_predicted, average='micro', zero_division=0.0) + return metrics.precision_score( + y_true, y_predicted, average="micro", zero_division=0.0 + ) class PrecisionMacro(MulticlassClassificationObjective): """Precision score for multiclass classification using macro averaging.""" + name = "Precision Macro" greater_is_better = True score_needs_proba = False @@ -143,11 +156,14 @@ class PrecisionMacro(MulticlassClassificationObjective): is_bounded_like_percentage = True def objective_function(self, y_true, y_predicted, X=None): - return metrics.precision_score(y_true, y_predicted, average='macro', zero_division=0.0) + return metrics.precision_score( + y_true, y_predicted, average="macro", zero_division=0.0 + ) class PrecisionWeighted(MulticlassClassificationObjective): """Precision score for multiclass classification using weighted averaging.""" + name = "Precision Weighted" greater_is_better = True score_needs_proba = False @@ -155,11 +171,14 @@ class PrecisionWeighted(MulticlassClassificationObjective): is_bounded_like_percentage = True def objective_function(self, y_true, y_predicted, X=None): - return metrics.precision_score(y_true, y_predicted, average='weighted', zero_division=0.0) + return metrics.precision_score( + y_true, y_predicted, average="weighted", zero_division=0.0 + ) class Recall(BinaryClassificationObjective): """Recall score for binary classification.""" + name = "Recall" greater_is_better = True score_needs_proba = False @@ -172,6 +191,7 @@ def objective_function(self, y_true, y_predicted, X=None): class RecallMicro(MulticlassClassificationObjective): """Recall score for multiclass classification using micro averaging.""" + name = "Recall Micro" greater_is_better = True score_needs_proba = False @@ -179,11 +199,14 @@ class RecallMicro(MulticlassClassificationObjective): is_bounded_like_percentage = True def objective_function(self, y_true, y_predicted, X=None): - return metrics.recall_score(y_true, y_predicted, average='micro', zero_division=0.0) + return metrics.recall_score( + y_true, y_predicted, average="micro", zero_division=0.0 + ) class RecallMacro(MulticlassClassificationObjective): """Recall score for multiclass classification using macro averaging.""" + name = "Recall Macro" greater_is_better = True score_needs_proba = False @@ -191,11 +214,14 @@ class RecallMacro(MulticlassClassificationObjective): is_bounded_like_percentage = True def objective_function(self, y_true, y_predicted, X=None): - return metrics.recall_score(y_true, y_predicted, average='macro', zero_division=0.0) + return metrics.recall_score( + y_true, y_predicted, average="macro", zero_division=0.0 + ) class 
RecallWeighted(MulticlassClassificationObjective): """Recall score for multiclass classification using weighted averaging.""" + name = "Recall Weighted" greater_is_better = True score_needs_proba = False @@ -203,11 +229,14 @@ class RecallWeighted(MulticlassClassificationObjective): is_bounded_like_percentage = True def objective_function(self, y_true, y_predicted, X=None): - return metrics.recall_score(y_true, y_predicted, average='weighted', zero_division=0.0) + return metrics.recall_score( + y_true, y_predicted, average="weighted", zero_division=0.0 + ) class AUC(BinaryClassificationObjective): """AUC score for binary classification.""" + name = "AUC" greater_is_better = True score_needs_proba = True @@ -220,6 +249,7 @@ def objective_function(self, y_true, y_predicted, X=None): class AUCMicro(MulticlassClassificationObjective): """AUC score for multiclass classification using micro averaging.""" + name = "AUC Micro" greater_is_better = True score_needs_proba = True @@ -228,11 +258,12 @@ class AUCMicro(MulticlassClassificationObjective): def objective_function(self, y_true, y_predicted, X=None): y_true, y_predicted = _handle_predictions(y_true, y_predicted) - return metrics.roc_auc_score(y_true, y_predicted, average='micro') + return metrics.roc_auc_score(y_true, y_predicted, average="micro") class AUCMacro(MulticlassClassificationObjective): """AUC score for multiclass classification using macro averaging.""" + name = "AUC Macro" greater_is_better = True score_needs_proba = True @@ -241,11 +272,12 @@ class AUCMacro(MulticlassClassificationObjective): def objective_function(self, y_true, y_predicted, X=None): y_true, y_predicted = _handle_predictions(y_true, y_predicted) - return metrics.roc_auc_score(y_true, y_predicted, average='macro') + return metrics.roc_auc_score(y_true, y_predicted, average="macro") class AUCWeighted(MulticlassClassificationObjective): """AUC Score for multiclass classification using weighted averaging.""" + name = "AUC Weighted" greater_is_better = True score_needs_proba = True @@ -254,11 +286,12 @@ class AUCWeighted(MulticlassClassificationObjective): def objective_function(self, y_true, y_predicted, X=None): y_true, y_predicted = _handle_predictions(y_true, y_predicted) - return metrics.roc_auc_score(y_true, y_predicted, average='weighted') + return metrics.roc_auc_score(y_true, y_predicted, average="weighted") class LogLossBinary(BinaryClassificationObjective): """Log Loss for binary classification.""" + name = "Log Loss Binary" greater_is_better = False score_needs_proba = True @@ -271,6 +304,7 @@ def objective_function(self, y_true, y_predicted, X=None): class LogLossMulticlass(MulticlassClassificationObjective): """Log Loss for multiclass classification.""" + name = "Log Loss Multiclass" greater_is_better = False score_needs_proba = True @@ -283,6 +317,7 @@ def objective_function(self, y_true, y_predicted, X=None): class MCCBinary(BinaryClassificationObjective): """Matthews correlation coefficient for binary classification.""" + name = "MCC Binary" greater_is_better = True score_needs_proba = False @@ -292,12 +327,13 @@ class MCCBinary(BinaryClassificationObjective): def objective_function(self, y_true, y_predicted, X=None): with warnings.catch_warnings(): # catches runtime warning when dividing by 0.0 - warnings.simplefilter('ignore', RuntimeWarning) + warnings.simplefilter("ignore", RuntimeWarning) return metrics.matthews_corrcoef(y_true, y_predicted) class MCCMulticlass(MulticlassClassificationObjective): """Matthews correlation coefficient for multiclass 
classification.""" + name = "MCC Multiclass" greater_is_better = True score_needs_proba = False @@ -307,12 +343,13 @@ class MCCMulticlass(MulticlassClassificationObjective): def objective_function(self, y_true, y_predicted, X=None): with warnings.catch_warnings(): # catches runtime warning when dividing by 0.0 - warnings.simplefilter('ignore', RuntimeWarning) + warnings.simplefilter("ignore", RuntimeWarning) return metrics.matthews_corrcoef(y_true, y_predicted) class RootMeanSquaredError(RegressionObjective): """Root mean squared error for regression.""" + name = "Root Mean Squared Error" greater_is_better = False score_needs_proba = False @@ -328,6 +365,7 @@ class RootMeanSquaredLogError(RegressionObjective): Only valid for nonnegative inputs.Otherwise, will throw a ValueError. """ + name = "Root Mean Squared Log Error" greater_is_better = False score_needs_proba = False @@ -348,6 +386,7 @@ class MeanSquaredLogError(RegressionObjective): Only valid for nonnegative inputs. Otherwise, will throw a ValueError """ + name = "Mean Squared Log Error" greater_is_better = False score_needs_proba = False @@ -365,6 +404,7 @@ def positive_only(self): class R2(RegressionObjective): """Coefficient of determination for regression.""" + name = "R2" greater_is_better = True score_needs_proba = False @@ -377,6 +417,7 @@ def objective_function(self, y_true, y_predicted, X=None): class MAE(RegressionObjective): """Mean absolute error for regression.""" + name = "MAE" greater_is_better = False score_needs_proba = False @@ -392,6 +433,7 @@ class MAPE(TimeSeriesRegressionObjective): Only valid for nonzero inputs. Otherwise, will throw a ValueError """ + name = "Mean Absolute Percentage Error" greater_is_better = False score_needs_proba = False @@ -400,8 +442,10 @@ class MAPE(TimeSeriesRegressionObjective): def objective_function(self, y_true, y_predicted, X=None): if (y_true == 0).any(): - raise ValueError("Mean Absolute Percentage Error cannot be used when " - "targets contain the value 0.") + raise ValueError( + "Mean Absolute Percentage Error cannot be used when " + "targets contain the value 0." 
+ ) if isinstance(y_true, pd.Series): y_true = y_true.values if isinstance(y_predicted, pd.Series): @@ -417,6 +461,7 @@ def positive_only(self): class MSE(RegressionObjective): """Mean squared error for regression.""" + name = "MSE" greater_is_better = False score_needs_proba = False @@ -429,6 +474,7 @@ def objective_function(self, y_true, y_predicted, X=None): class MedianAE(RegressionObjective): """Median absolute error for regression.""" + name = "MedianAE" greater_is_better = False score_needs_proba = False @@ -441,6 +487,7 @@ def objective_function(self, y_true, y_predicted, X=None): class MaxError(RegressionObjective): """Maximum residual error for regression.""" + name = "MaxError" greater_is_better = False score_needs_proba = False @@ -453,6 +500,7 @@ def objective_function(self, y_true, y_predicted, X=None): class ExpVariance(RegressionObjective): """Explained variance score for regression.""" + name = "ExpVariance" greater_is_better = True score_needs_proba = False diff --git a/evalml/objectives/utils.py b/evalml/objectives/utils.py index 94bb13581d..4478931e76 100644 --- a/evalml/objectives/utils.py +++ b/evalml/objectives/utils.py @@ -1,4 +1,3 @@ - from .objective_base import ObjectiveBase from evalml import objectives @@ -16,17 +15,25 @@ def get_non_core_objectives(): Returns: List of ObjectiveBase classes """ - return [objectives.CostBenefitMatrix, objectives.FraudCost, objectives.LeadScoring, - objectives.Recall, objectives.RecallMacro, objectives.RecallMicro, - objectives.RecallWeighted, objectives.MAPE, objectives.MeanSquaredLogError, - objectives.RootMeanSquaredLogError] + return [ + objectives.CostBenefitMatrix, + objectives.FraudCost, + objectives.LeadScoring, + objectives.Recall, + objectives.RecallMacro, + objectives.RecallMicro, + objectives.RecallWeighted, + objectives.MAPE, + objectives.MeanSquaredLogError, + objectives.RootMeanSquaredLogError, + ] def _all_objectives_dict(): all_objectives = _get_subclasses(ObjectiveBase) objectives_dict = {} for objective in all_objectives: - if 'evalml.objectives' not in objective.__module__: + if "evalml.objectives" not in objective.__module__: continue objectives_dict[objective.name.lower()] = objective return objectives_dict @@ -50,7 +57,11 @@ def get_core_objective_names(): """ all_objectives = _all_objectives_dict() non_core = get_non_core_objectives() - return [name for name, class_name in all_objectives.items() if class_name not in non_core] + return [ + name + for name, class_name in all_objectives.items() + if class_name not in non_core + ] def get_objective(objective, return_instance=False, **kwargs): @@ -73,11 +84,15 @@ def get_objective(objective, return_instance=False, **kwargs): return objective all_objectives_dict = _all_objectives_dict() if not isinstance(objective, str): - raise TypeError("If parameter objective is not a string, it must be an instance of ObjectiveBase!") + raise TypeError( + "If parameter objective is not a string, it must be an instance of ObjectiveBase!" + ) if objective.lower() not in all_objectives_dict: - raise ObjectiveNotFoundError(f"{objective} is not a valid Objective! " - "Use evalml.objectives.get_all_objective_names()" - "to get a list of all valid objective names. ") + raise ObjectiveNotFoundError( + f"{objective} is not a valid Objective! " + "Use evalml.objectives.get_all_objective_names()" + "to get a list of all valid objective names. 
" + ) objective_class = all_objectives_dict[objective.lower()] @@ -85,7 +100,9 @@ def get_objective(objective, return_instance=False, **kwargs): try: return objective_class(**kwargs) except TypeError as e: - raise ObjectiveCreationError(f"In get_objective, cannot pass in return_instance=True for {objective} because {str(e)}") + raise ObjectiveCreationError( + f"In get_objective, cannot pass in return_instance=True for {objective} because {str(e)}" + ) return objective_class @@ -103,5 +120,10 @@ def get_core_objectives(problem_type): """ problem_type = handle_problem_types(problem_type) all_objectives_dict = _all_objectives_dict() - objectives = [obj() for obj in all_objectives_dict.values() if obj.is_defined_for_problem_type(problem_type) and obj not in get_non_core_objectives()] + objectives = [ + obj() + for obj in all_objectives_dict.values() + if obj.is_defined_for_problem_type(problem_type) + and obj not in get_non_core_objectives() + ] return objectives diff --git a/evalml/pipelines/__init__.py b/evalml/pipelines/__init__.py index ad1e5fcb5c..f4cb3cea24 100644 --- a/evalml/pipelines/__init__.py +++ b/evalml/pipelines/__init__.py @@ -32,7 +32,7 @@ KNeighborsClassifier, SVMClassifier, SVMRegressor, - ARIMARegressor + ARIMARegressor, ) from .component_graph import ComponentGraph @@ -44,6 +44,6 @@ from .time_series_classification_pipelines import ( TimeSeriesClassificationPipeline, TimeSeriesBinaryClassificationPipeline, - TimeSeriesMulticlassClassificationPipeline + TimeSeriesMulticlassClassificationPipeline, ) from .time_series_regression_pipeline import TimeSeriesRegressionPipeline diff --git a/evalml/pipelines/binary_classification_pipeline.py b/evalml/pipelines/binary_classification_pipeline.py index d1021ddf87..776664894e 100644 --- a/evalml/pipelines/binary_classification_pipeline.py +++ b/evalml/pipelines/binary_classification_pipeline.py @@ -1,6 +1,4 @@ -from .binary_classification_pipeline_mixin import ( - BinaryClassificationPipelineMixin -) +from .binary_classification_pipeline_mixin import BinaryClassificationPipelineMixin from evalml.objectives import get_objective from evalml.pipelines.classification_pipeline import ClassificationPipeline @@ -8,8 +6,11 @@ from evalml.utils import infer_feature_types -class BinaryClassificationPipeline(BinaryClassificationPipelineMixin, ClassificationPipeline): +class BinaryClassificationPipeline( + BinaryClassificationPipelineMixin, ClassificationPipeline +): """Pipeline subclass for all binary classification pipelines.""" + problem_type = ProblemTypes.BINARY def _predict(self, X, objective=None): @@ -26,7 +27,9 @@ def _predict(self, X, objective=None): if objective is not None: objective = get_objective(objective, return_instance=True) if not objective.is_defined_for_problem_type(self.problem_type): - raise ValueError("You can only use a binary classification objective to make predictions for a binary classification pipeline.") + raise ValueError( + "You can only use a binary classification objective to make predictions for a binary classification pipeline." + ) if self.threshold is None: return self._component_graph.predict(X) @@ -47,8 +50,7 @@ def predict_proba(self, X): @staticmethod def _score(X, y, predictions, objective): - """Given data, model predictions or predicted probabilities computed on the data, and an objective, evaluate and return the objective score. 
- """ + """Given data, model predictions or predicted probabilities computed on the data, and an objective, evaluate and return the objective score.""" if predictions.ndim > 1: predictions = predictions.iloc[:, 1] return ClassificationPipeline._score(X, y, predictions, objective) diff --git a/evalml/pipelines/binary_classification_pipeline_mixin.py b/evalml/pipelines/binary_classification_pipeline_mixin.py index 8ee277d332..53008ea1a3 100644 --- a/evalml/pipelines/binary_classification_pipeline_mixin.py +++ b/evalml/pipelines/binary_classification_pipeline_mixin.py @@ -1,5 +1,4 @@ - -class BinaryClassificationPipelineMixin(): +class BinaryClassificationPipelineMixin: _threshold = None @property @@ -22,9 +21,13 @@ def _compute_predictions(self, X, y, objectives, time_series=False): y_predicted = None y_predicted_proba = None if any(o.score_needs_proba for o in objectives) or self.threshold is not None: - y_predicted_proba = self.predict_proba(X, y) if time_series else self.predict_proba(X) + y_predicted_proba = ( + self.predict_proba(X, y) if time_series else self.predict_proba(X) + ) if any(not o.score_needs_proba for o in objectives) and self.threshold is None: - y_predicted = self._predict(X, y, pad=True) if time_series else self._predict(X) + y_predicted = ( + self._predict(X, y, pad=True) if time_series else self._predict(X) + ) return y_predicted, y_predicted_proba def _select_y_pred_for_score(self, X, y, y_pred, y_pred_proba, objective): @@ -46,4 +49,6 @@ def optimize_threshold(self, X, y, y_pred_proba, objective): targets = self._encode_targets(y) self.threshold = objective.optimize_threshold(y_pred_proba, targets, X) else: - raise ValueError("Problem type must be binary and objective must be optimizable.") + raise ValueError( + "Problem type must be binary and objective must be optimizable." + ) diff --git a/evalml/pipelines/classification_pipeline.py b/evalml/pipelines/classification_pipeline.py index 111a2e13c9..8df86b0d1c 100644 --- a/evalml/pipelines/classification_pipeline.py +++ b/evalml/pipelines/classification_pipeline.py @@ -1,4 +1,3 @@ - import pandas as pd from sklearn.preprocessing import LabelEncoder @@ -9,12 +8,16 @@ class ClassificationPipeline(PipelineBase): """Pipeline subclass for all classification pipelines.""" - def __init__(self, component_graph, parameters=None, custom_name=None, random_seed=0): + def __init__( + self, component_graph, parameters=None, custom_name=None, random_seed=0 + ): self._encoder = LabelEncoder() - super().__init__(component_graph, - custom_name=custom_name, - parameters=parameters, - random_seed=random_seed) + super().__init__( + component_graph, + custom_name=custom_name, + parameters=parameters, + random_seed=random_seed, + ) def fit(self, X, y): """Build a classification model. For string and categorical targets, classes are sorted @@ -44,16 +47,18 @@ def _encode_targets(self, y): def _decode_targets(self, y): """Converts encoded numerical values to their original target values. 
- Note: we cast y as ints first to address boolean values that may be returned from - calculating predictions which we would not be able to otherwise transform if we - originally had integer targets.""" + Note: we cast y as ints first to address boolean values that may be returned from + calculating predictions which we would not be able to otherwise transform if we + originally had integer targets.""" return self._encoder.inverse_transform(y.astype(int)) @property def classes_(self): """Gets the class names for the problem.""" if not hasattr(self._encoder, "classes_"): - raise AttributeError("Cannot access class names before fitting the pipeline.") + raise AttributeError( + "Cannot access class names before fitting the pipeline." + ) return self._encoder.classes_ def _predict(self, X, objective=None): @@ -79,7 +84,9 @@ def predict(self, X, objective=None): pd.Series: Estimated labels """ predictions = self._predict(X, objective=objective) - predictions = pd.Series(self._decode_targets(predictions), name=self.input_target_name) + predictions = pd.Series( + self._decode_targets(predictions), name=self.input_target_name + ) return infer_feature_types(predictions) def predict_proba(self, X): @@ -93,7 +100,12 @@ def predict_proba(self, X): """ X = self.compute_estimator_features(X, y=None) proba = self.estimator.predict_proba(X) - proba = proba.ww.rename(columns={col: new_col for col, new_col in zip(proba.columns, self._encoder.classes_)}) + proba = proba.ww.rename( + columns={ + col: new_col + for col, new_col in zip(proba.columns, self._encoder.classes_) + } + ) return infer_feature_types(proba) def score(self, X, y, objectives): @@ -111,14 +123,20 @@ def score(self, X, y, objectives): objectives = self.create_objectives(objectives) y = self._encode_targets(y) y_predicted, y_predicted_proba = self._compute_predictions(X, y, objectives) - return self._score_all_objectives(X, y, y_predicted, y_predicted_proba, objectives) + return self._score_all_objectives( + X, y, y_predicted, y_predicted_proba, objectives + ) def _compute_predictions(self, X, y, objectives, time_series=False): """Compute predictions/probabilities based on objectives.""" y_predicted = None y_predicted_proba = None if any(o.score_needs_proba for o in objectives): - y_predicted_proba = self.predict_proba(X, y) if time_series else self.predict_proba(X) + y_predicted_proba = ( + self.predict_proba(X, y) if time_series else self.predict_proba(X) + ) if any(not o.score_needs_proba for o in objectives): - y_predicted = self._predict(X, y, pad=True) if time_series else self._predict(X) + y_predicted = ( + self._predict(X, y, pad=True) if time_series else self._predict(X) + ) return y_predicted, y_predicted_proba diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py index 2048e8896d..d229d324b0 100644 --- a/evalml/pipelines/component_graph.py +++ b/evalml/pipelines/component_graph.py @@ -10,19 +10,21 @@ class ComponentGraph: def __init__(self, component_dict=None, random_seed=0): - """ Initializes a component graph for a pipeline as a directed acyclic graph (DAG). + """Initializes a component graph for a pipeline as a directed acyclic graph (DAG). 
Example: >>> component_dict = {'imputer': ['Imputer'], 'ohe': ['One Hot Encoder', 'imputer.x'], 'estimator_1': ['Random Forest Classifier', 'ohe.x'], 'estimator_2': ['Decision Tree Classifier', 'ohe.x'], 'final': ['Logistic Regression Classifier', 'estimator_1', 'estimator_2']} >>> component_graph = ComponentGraph(component_dict) - """ + """ self.random_seed = random_seed self.component_dict = component_dict or {} self.component_instances = {} self._is_instantiated = False for component_name, component_info in self.component_dict.items(): if not isinstance(component_info, list): - raise ValueError('All component information should be passed in as a list') + raise ValueError( + "All component information should be passed in as a list" + ) component_class = handle_component_class(component_info[0]) self.component_instances[component_name] = component_class self.compute_order = self.generate_order(self.component_dict) @@ -51,7 +53,7 @@ def linearized_component_graph(cls, components): component_name = component_class.name if component_name in seen: - component_name = f'{component_name}_{idx}' + component_name = f"{component_name}_{idx}" seen.add(component_name) names.append((component_name, component_class)) else: @@ -69,12 +71,16 @@ def from_list(cls, component_list, random_seed=0): """ component_dict = {} previous_component = None - for component_name, component_class in cls.linearized_component_graph(component_list): + for component_name, component_class in cls.linearized_component_graph( + component_list + ): component_dict[component_name] = [component_class] if previous_component is not None: if "sampler" in previous_component: - component_dict[component_name].extend([f"{previous_component}.x", f"{previous_component}.y"]) + component_dict[component_name].extend( + [f"{previous_component}.x", f"{previous_component}.y"] + ) else: component_dict[component_name].append(f"{previous_component}.x") previous_component = component_name @@ -89,7 +95,9 @@ def instantiate(self, parameters): An empty dictionary {} or None implies using all default values for component parameters. 
""" if self._is_instantiated: - raise ValueError(f"Cannot reinstantiate a component graph that was previously instantiated") + raise ValueError( + f"Cannot reinstantiate a component graph that was previously instantiated" + ) parameters = parameters or {} self._is_instantiated = True @@ -97,10 +105,14 @@ def instantiate(self, parameters): for component_name, component_class in self.component_instances.items(): component_parameters = parameters.get(component_name, {}) try: - new_component = component_class(**component_parameters, random_seed=self.random_seed) + new_component = component_class( + **component_parameters, random_seed=self.random_seed + ) except (ValueError, TypeError) as e: self._is_instantiated = False - err = "Error received when instantiating component {} with the following arguments {}".format(component_name, component_parameters) + err = "Error received when instantiating component {} with the following arguments {}".format( + component_name, component_parameters + ) raise ValueError(err) from e component_instances[component_name] = new_component @@ -159,18 +171,26 @@ def _fit_transform_features_helper(self, needs_fitting, X, y=None): X = infer_feature_types(X) self.input_feature_names.update({self.compute_order[0]: list(X.columns)}) return X - component_outputs = self._compute_features(self.compute_order[:-1], X, y=y, fit=needs_fitting) + component_outputs = self._compute_features( + self.compute_order[:-1], X, y=y, fit=needs_fitting + ) final_component_inputs = [] for parent in self.get_parents(self.compute_order[-1]): - parent_output = component_outputs.get(parent, component_outputs.get(f'{parent}.x')) + parent_output = component_outputs.get( + parent, component_outputs.get(f"{parent}.x") + ) if isinstance(parent_output, pd.Series): parent_output = pd.DataFrame(parent_output, columns=[parent]) parent_output = infer_feature_types(parent_output) if parent_output is not None: final_component_inputs.append(parent_output) - concatted = pd.concat([component_input for component_input in final_component_inputs], axis=1) + concatted = pd.concat( + [component_input for component_input in final_component_inputs], axis=1 + ) if needs_fitting: - self.input_feature_names.update({self.compute_order[-1]: list(concatted.columns)}) + self.input_feature_names.update( + {self.compute_order[-1]: list(concatted.columns)} + ) return infer_feature_types(concatted) def predict(self, X): @@ -186,7 +206,9 @@ def predict(self, X): return infer_feature_types(X) final_component = self.compute_order[-1] outputs = self._compute_features(self.compute_order, X) - return infer_feature_types(outputs.get(final_component, outputs.get(f'{final_component}.x'))) + return infer_feature_types( + outputs.get(final_component, outputs.get(f"{final_component}.x")) + ) def _compute_features(self, component_list, X, y=None, fit=False): """Transforms the data by applying the given components. 
@@ -209,20 +231,28 @@ def _compute_features(self, component_list, X, y=None, fit=False): for component_name in component_list: component_instance = self.get_component(component_name) if not isinstance(component_instance, ComponentBase): - raise ValueError('All components must be instantiated before fitting or predicting') + raise ValueError( + "All components must be instantiated before fitting or predicting" + ) x_inputs = [] y_input = None for parent_input in self.get_parents(component_name): - if parent_input[-2:] == '.y': + if parent_input[-2:] == ".y": if y_input is not None: - raise ValueError(f'Cannot have multiple `y` parents for a single component {component_name}') + raise ValueError( + f"Cannot have multiple `y` parents for a single component {component_name}" + ) y_input = output_cache[parent_input] else: - parent_x = output_cache.get(parent_input, output_cache.get(f'{parent_input}.x')) + parent_x = output_cache.get( + parent_input, output_cache.get(f"{parent_input}.x") + ) if isinstance(parent_x, pd.Series): parent_x = parent_x.rename(parent_input) x_inputs.append(parent_x) - input_x, input_y = self._consolidate_inputs(x_inputs, y_input, X, most_recent_y) + input_x, input_y = self._consolidate_inputs( + x_inputs, y_input, X, most_recent_y + ) self.input_feature_names.update({component_name: list(input_x.columns)}) if isinstance(component_instance, Transformer): if fit: @@ -240,7 +270,9 @@ def _compute_features(self, component_list, X, y=None, fit=False): else: if fit: component_instance.fit(input_x, input_y) - if not (fit and component_name == self.compute_order[-1]): # Don't call predict on the final component during fit + if not ( + fit and component_name == self.compute_order[-1] + ): # Don't call predict on the final component during fit output = component_instance.predict(input_x) else: output = None @@ -272,29 +304,44 @@ def _get_feature_provenance(self, input_feature_names): # each one starts with an empty set provenance = {col: set([]) for col in input_feature_names} - transformers = filter(lambda c: isinstance(c, Transformer), [self.get_component(c) for c in self.compute_order]) + transformers = filter( + lambda c: isinstance(c, Transformer), + [self.get_component(c) for c in self.compute_order], + ) for component_instance in transformers: component_provenance = component_instance._get_feature_provenance() for component_input, component_output in component_provenance.items(): # Case 1: The transformer created features from one of the original features if component_input in provenance: - provenance[component_input] = provenance[component_input].union(set(component_output)) + provenance[component_input] = provenance[component_input].union( + set(component_output) + ) # Case 2: The transformer created features from a feature created from an original feature. 
# Add it to the provenance of the original feature it was created from else: for in_feature, out_feature in provenance.items(): if component_input in out_feature: - provenance[in_feature] = out_feature.union(set(component_output)) + provenance[in_feature] = out_feature.union( + set(component_output) + ) # Get rid of features that are not in the dataset the final estimator uses - final_estimator_features = set(self.input_feature_names.get(self.compute_order[-1], [])) + final_estimator_features = set( + self.input_feature_names.get(self.compute_order[-1], []) + ) for feature in provenance: - provenance[feature] = provenance[feature].intersection(final_estimator_features) + provenance[feature] = provenance[feature].intersection( + final_estimator_features + ) # Delete features that weren't used to create other features - return {feature: children for feature, children in provenance.items() if len(children)} + return { + feature: children + for feature, children in provenance.items() + if len(children) + } @staticmethod def _consolidate_inputs(x_inputs, y_input, X, y): @@ -333,7 +380,7 @@ def get_component(self, component_name): try: return self.component_instances[component_name] except KeyError: - raise ValueError(f'Component {component_name} is not in the graph') + raise ValueError(f"Component {component_name} is not in the graph") def get_last_component(self): """Retrieves the component that is computed last in the graph, usually the final estimator. @@ -342,7 +389,7 @@ def get_last_component(self): ComponentBase object """ if len(self.compute_order) == 0: - raise ValueError('Cannot get last component from edgeless graph') + raise ValueError("Cannot get last component from edgeless graph") last_component_name = self.compute_order[-1] return self.get_component(last_component_name) @@ -353,8 +400,14 @@ def get_estimators(self): list: All estimator objects within the graph """ if not isinstance(self.get_last_component(), ComponentBase): - raise ValueError('Cannot get estimators until the component graph is instantiated') - return [component_class for component_class in self.component_instances.values() if isinstance(component_class, Estimator)] + raise ValueError( + "Cannot get estimators until the component graph is instantiated" + ) + return [ + component_class + for component_class in self.component_instances.values() + if isinstance(component_class, Estimator) + ] def get_parents(self, component_name): """Finds the names of all parent nodes of the given component @@ -383,31 +436,39 @@ def graph(self, name=None, graph_format=None): Returns: graphviz.Digraph: Graph object that can be directly displayed in Jupyter notebooks. """ - graphviz = import_or_raise('graphviz', error_msg='Please install graphviz to visualize pipelines.') + graphviz = import_or_raise( + "graphviz", error_msg="Please install graphviz to visualize pipelines." 
+ ) # Try rendering a dummy graph to see if a working backend is installed try: graphviz.Digraph().pipe() except graphviz.backend.ExecutableNotFound: raise RuntimeError( - "To visualize component graphs, a graphviz backend is required.\n" + - "Install the backend using one of the following commands:\n" + - " Mac OS: brew install graphviz\n" + - " Linux (Ubuntu): sudo apt-get install graphviz\n" + - " Windows: conda install python-graphviz\n" + "To visualize component graphs, a graphviz backend is required.\n" + + "Install the backend using one of the following commands:\n" + + " Mac OS: brew install graphviz\n" + + " Linux (Ubuntu): sudo apt-get install graphviz\n" + + " Windows: conda install python-graphviz\n" ) - graph = graphviz.Digraph(name=name, format=graph_format, - graph_attr={'splines': 'ortho'}) - graph.attr(rankdir='LR') + graph = graphviz.Digraph( + name=name, format=graph_format, graph_attr={"splines": "ortho"} + ) + graph.attr(rankdir="LR") for component_name, component_class in self.component_instances.items(): - label = '%s\l' % (component_name) # noqa: W605 + label = "%s\l" % (component_name) # noqa: W605 if isinstance(component_class, ComponentBase): - parameters = '\l'.join([key + ' : ' + "{:0.2f}".format(val) if (isinstance(val, float)) - else key + ' : ' + str(val) - for key, val in component_class.parameters.items()]) # noqa: W605 - label = '%s |%s\l' % (component_name, parameters) # noqa: W605 - graph.node(component_name, shape='record', label=label) + parameters = "\l".join( + [ + key + " : " + "{:0.2f}".format(val) + if (isinstance(val, float)) + else key + " : " + str(val) + for key, val in component_class.parameters.items() + ] + ) # noqa: W605 + label = "%s |%s\l" % (component_name, parameters) # noqa: W605 + graph.node(component_name, shape="record", label=label) edges = self._get_edges(self.component_dict) graph.edges(edges) return graph @@ -418,7 +479,7 @@ def _get_edges(component_dict): for component_name, component_info in component_dict.items(): if len(component_info) > 1: for parent in component_info[1:]: - if parent[-2:] == '.x' or parent[-2:] == '.y': + if parent[-2:] == ".x" or parent[-2:] == ".y": parent = parent[:-2] edges.append((parent, component_name)) return edges @@ -434,14 +495,20 @@ def generate_order(cls, component_dict): digraph = nx.DiGraph() digraph.add_edges_from(edges) if not nx.is_weakly_connected(digraph): - raise ValueError('The given graph is not completely connected') + raise ValueError("The given graph is not completely connected") try: compute_order = list(topological_sort(digraph)) except NetworkXUnfeasible: - raise ValueError('The given graph contains a cycle') - end_components = [component for component in compute_order if len(nx.descendants(digraph, component)) == 0] + raise ValueError("The given graph contains a cycle") + end_components = [ + component + for component in compute_order + if len(nx.descendants(digraph, component)) == 0 + ] if len(end_components) != 1: - raise ValueError('The given graph has more than one final (childless) component') + raise ValueError( + "The given graph has more than one final (childless) component" + ) return compute_order def __getitem__(self, index): diff --git a/evalml/pipelines/components/__init__.py b/evalml/pipelines/components/__init__.py index bfe37fa388..5f4642f9fa 100644 --- a/evalml/pipelines/components/__init__.py +++ b/evalml/pipelines/components/__init__.py @@ -23,7 +23,7 @@ KNeighborsClassifier, SVMClassifier, SVMRegressor, - ARIMARegressor + ARIMARegressor, ) from 
.transformers import ( Transformer, @@ -51,9 +51,6 @@ PolynomialDetrender, SMOTESampler, SMOTENCSampler, - SMOTENSampler -) -from .ensemble import ( - StackedEnsembleClassifier, - StackedEnsembleRegressor + SMOTENSampler, ) +from .ensemble import StackedEnsembleClassifier, StackedEnsembleRegressor diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py index d2db8e4fc0..b4bf82dc4d 100644 --- a/evalml/pipelines/components/component_base.py +++ b/evalml/pipelines/components/component_base.py @@ -10,7 +10,7 @@ get_logger, infer_feature_types, log_subtitle, - safe_repr + safe_repr, ) logger = get_logger(__file__) @@ -18,6 +18,7 @@ class ComponentBase(ABC, metaclass=ComponentBaseMeta): """Base class for all components.""" + _default_parameters = None def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs): @@ -41,9 +42,9 @@ def model_family(cls): @classproperty def needs_fitting(self): """Returns boolean determining if component needs fitting before - calling predict, predict_proba, transform, or feature_importances. - This can be overridden to False for components that do not need to be fit - or whose fit methods do nothing.""" + calling predict, predict_proba, transform, or feature_importances. + This can be overridden to False for components that do not need to be fit + or whose fit methods do nothing.""" return True @property @@ -55,10 +56,10 @@ def parameters(self): def default_parameters(cls): """Returns the default parameters for this component. - Our convention is that Component.default_parameters == Component().parameters. + Our convention is that Component.default_parameters == Component().parameters. - Returns: - dict: default parameters for this component. + Returns: + dict: default parameters for this component. 
""" if cls._default_parameters is None: @@ -91,7 +92,9 @@ def fit(self, X, y=None): self._component_obj.fit(X, y) return self except AttributeError: - raise MethodPropertyNotFoundError("Component requires a fit method or a component_obj that implements fit") + raise MethodPropertyNotFoundError( + "Component requires a fit method or a component_obj that implements fit" + ) def describe(self, print_name=False, return_dict=False): """Describe a component and its parameters @@ -107,7 +110,9 @@ def describe(self, print_name=False, return_dict=False): title = self.name log_subtitle(logger, title) for parameter in self.parameters: - parameter_str = ("\t * {} : {}").format(parameter, self.parameters[parameter]) + parameter_str = ("\t * {} : {}").format( + parameter, self.parameters[parameter] + ) logger.info(parameter_str) if return_dict: component_dict = {"name": self.name} @@ -124,7 +129,7 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): Returns: None """ - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: cloudpickle.dump(self, f, protocol=pickle_protocol) @staticmethod @@ -137,7 +142,7 @@ def load(file_path): Returns: ComponentBase object """ - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: return cloudpickle.load(f) def __eq__(self, other): @@ -146,7 +151,7 @@ def __eq__(self, other): random_seed_eq = self.random_seed == other.random_seed if not random_seed_eq: return False - attributes_to_check = ['_parameters', '_is_fitted'] + attributes_to_check = ["_parameters", "_is_fitted"] for attribute in attributes_to_check: if getattr(self, attribute) != getattr(other, attribute): return False @@ -156,5 +161,7 @@ def __str__(self): return self.name def __repr__(self): - parameters_repr = ', '.join([f'{key}={safe_repr(value)}' for key, value in self.parameters.items()]) - return f'{(type(self).__name__)}({parameters_repr})' + parameters_repr = ", ".join( + [f"{key}={safe_repr(value)}" for key, value in self.parameters.items()] + ) + return f"{(type(self).__name__)}({parameters_repr})" diff --git a/evalml/pipelines/components/component_base_meta.py b/evalml/pipelines/components/component_base_meta.py index bfa571e929..bd901b0828 100644 --- a/evalml/pipelines/components/component_base_meta.py +++ b/evalml/pipelines/components/component_base_meta.py @@ -1,5 +1,3 @@ - - from functools import wraps from evalml.exceptions import ComponentNotYetFittedError @@ -12,14 +10,17 @@ class ComponentBaseMeta(BaseMeta): @classmethod def check_for_fit(cls, method): """`check_for_fit` wraps a method that validates if `self._is_fitted` is `True`. - It raises an exception if `False` and calls and returns the wrapped method if `True`. + It raises an exception if `False` and calls and returns the wrapped method if `True`. """ + @wraps(method) def _check_for_fit(self, X=None, y=None): klass = type(self).__name__ if not self._is_fitted and self.needs_fitting: - raise ComponentNotYetFittedError(f'This {klass} is not fitted yet. You must fit {klass} before calling {method.__name__}.') - elif method.__name__ == 'inverse_transform': + raise ComponentNotYetFittedError( + f"This {klass} is not fitted yet. You must fit {klass} before calling {method.__name__}." 
+ ) + elif method.__name__ == "inverse_transform": return method(self, X, y) elif X is None and y is None: return method(self) @@ -27,4 +28,5 @@ def _check_for_fit(self, X=None, y=None): return method(self, X) else: return method(self, X, y) + return _check_for_fit diff --git a/evalml/pipelines/components/ensemble/stacked_ensemble_base.py b/evalml/pipelines/components/ensemble/stacked_ensemble_base.py index 7c52471ee9..44eb6ea8fd 100644 --- a/evalml/pipelines/components/ensemble/stacked_ensemble_base.py +++ b/evalml/pipelines/components/ensemble/stacked_ensemble_base.py @@ -9,12 +9,21 @@ class StackedEnsembleBase(Estimator): """Stacked Ensemble Base Class.""" + model_family = ModelFamily.ENSEMBLE _stacking_estimator_class = None _default_final_estimator = None _default_cv = None - def __init__(self, input_pipelines=None, final_estimator=None, cv=None, n_jobs=None, random_seed=0, **kwargs): + def __init__( + self, + input_pipelines=None, + final_estimator=None, + cv=None, + n_jobs=None, + random_seed=0, + **kwargs, + ): """Stacked ensemble base class. Arguments: @@ -36,15 +45,25 @@ def __init__(self, input_pipelines=None, final_estimator=None, cv=None, n_jobs=N random_seed (int): Seed for the random number generator. Defaults to 0. """ if not input_pipelines: - raise EnsembleMissingPipelinesError("`input_pipelines` must not be None or an empty list.") - if [pipeline for pipeline in input_pipelines if pipeline.model_family in _nonstackable_model_families]: - raise ValueError("Pipelines with any of the following model families cannot be used as base pipelines: {}".format(_nonstackable_model_families)) + raise EnsembleMissingPipelinesError( + "`input_pipelines` must not be None or an empty list." + ) + if [ + pipeline + for pipeline in input_pipelines + if pipeline.model_family in _nonstackable_model_families + ]: + raise ValueError( + "Pipelines with any of the following model families cannot be used as base pipelines: {}".format( + _nonstackable_model_families + ) + ) parameters = { "input_pipelines": input_pipelines, "final_estimator": final_estimator, "cv": cv, - "n_jobs": n_jobs + "n_jobs": n_jobs, } parameters.update(kwargs) @@ -52,33 +71,43 @@ def __init__(self, input_pipelines=None, final_estimator=None, cv=None, n_jobs=N raise ValueError("All pipelines must have the same problem type.") cv = cv or self._default_cv(n_splits=3, random_state=random_seed, shuffle=True) - estimators = [scikit_learn_wrapped_estimator(pipeline) for pipeline in input_pipelines] - final_estimator = scikit_learn_wrapped_estimator(final_estimator or self._default_final_estimator()) + estimators = [ + scikit_learn_wrapped_estimator(pipeline) for pipeline in input_pipelines + ] + final_estimator = scikit_learn_wrapped_estimator( + final_estimator or self._default_final_estimator() + ) sklearn_parameters = { - "estimators": [(f"({idx})", estimator) for idx, estimator in enumerate(estimators)], + "estimators": [ + (f"({idx})", estimator) for idx, estimator in enumerate(estimators) + ], "final_estimator": final_estimator, "cv": cv, - "n_jobs": n_jobs + "n_jobs": n_jobs, } sklearn_parameters.update(kwargs) - super().__init__(parameters=parameters, - component_obj=self._stacking_estimator_class(**sklearn_parameters), - random_seed=random_seed) + super().__init__( + parameters=parameters, + component_obj=self._stacking_estimator_class(**sklearn_parameters), + random_seed=random_seed, + ) @property def feature_importance(self): """Not implemented for StackedEnsembleClassifier and StackedEnsembleRegressor""" - raise 
NotImplementedError("feature_importance is not implemented for StackedEnsembleClassifier and StackedEnsembleRegressor") + raise NotImplementedError( + "feature_importance is not implemented for StackedEnsembleClassifier and StackedEnsembleRegressor" + ) @classproperty def default_parameters(cls): """Returns the default parameters for stacked ensemble classes. - Returns: - dict: default parameters for this component. + Returns: + dict: default parameters for this component. """ return { - 'final_estimator': None, - 'cv': None, - 'n_jobs': -1, + "final_estimator": None, + "cv": None, + "n_jobs": -1, } diff --git a/evalml/pipelines/components/ensemble/stacked_ensemble_classifier.py b/evalml/pipelines/components/ensemble/stacked_ensemble_classifier.py index 2c4e0565c9..f0c3e4df00 100644 --- a/evalml/pipelines/components/ensemble/stacked_ensemble_classifier.py +++ b/evalml/pipelines/components/ensemble/stacked_ensemble_classifier.py @@ -9,17 +9,29 @@ class StackedEnsembleClassifier(StackedEnsembleBase): """Stacked Ensemble Classifier.""" + name = "Stacked Ensemble Classifier" model_family = ModelFamily.ENSEMBLE - supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.TIME_SERIES_BINARY, - ProblemTypes.TIME_SERIES_MULTICLASS] + supported_problem_types = [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ] hyperparameter_ranges = {} _stacking_estimator_class = StackingClassifier _default_final_estimator = LogisticRegressionClassifier _default_cv = StratifiedKFold - def __init__(self, input_pipelines=None, final_estimator=None, - cv=None, n_jobs=-1, random_seed=0, **kwargs): + def __init__( + self, + input_pipelines=None, + final_estimator=None, + cv=None, + n_jobs=-1, + random_seed=0, + **kwargs + ): """Stacked ensemble classifier. Arguments: @@ -40,5 +52,11 @@ def __init__(self, input_pipelines=None, final_estimator=None, - Note: there could be some multi-process errors thrown for values of `n_jobs != 1`. If this is the case, please use `n_jobs = 1`. random_seed (int): Seed for the random number generator. Defaults to 0. 
""" - super().__init__(input_pipelines=input_pipelines, final_estimator=final_estimator, - cv=cv, n_jobs=n_jobs, random_seed=random_seed, **kwargs) + super().__init__( + input_pipelines=input_pipelines, + final_estimator=final_estimator, + cv=cv, + n_jobs=n_jobs, + random_seed=random_seed, + **kwargs + ) diff --git a/evalml/pipelines/components/ensemble/stacked_ensemble_regressor.py b/evalml/pipelines/components/ensemble/stacked_ensemble_regressor.py index 10de6a8d83..74ca051aea 100644 --- a/evalml/pipelines/components/ensemble/stacked_ensemble_regressor.py +++ b/evalml/pipelines/components/ensemble/stacked_ensemble_regressor.py @@ -9,16 +9,27 @@ class StackedEnsembleRegressor(StackedEnsembleBase): """Stacked Ensemble Regressor.""" + name = "Stacked Ensemble Regressor" model_family = ModelFamily.ENSEMBLE - supported_problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION] + supported_problem_types = [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ] hyperparameter_ranges = {} _stacking_estimator_class = StackingRegressor _default_final_estimator = LinearRegressor _default_cv = KFold - def __init__(self, input_pipelines=None, final_estimator=None, - cv=None, n_jobs=-1, random_seed=0, **kwargs): + def __init__( + self, + input_pipelines=None, + final_estimator=None, + cv=None, + n_jobs=-1, + random_seed=0, + **kwargs + ): """Stacked ensemble regressor. Arguments: @@ -39,5 +50,11 @@ def __init__(self, input_pipelines=None, final_estimator=None, - Note: there could be some multi-process errors thrown for values of `n_jobs != 1`. If this is the case, please use `n_jobs = 1`. random_seed (int): Seed for the random number generator. Defaults to 0. """ - super().__init__(input_pipelines=input_pipelines, final_estimator=final_estimator, cv=cv, - n_jobs=n_jobs, random_seed=random_seed, **kwargs) + super().__init__( + input_pipelines=input_pipelines, + final_estimator=final_estimator, + cv=cv, + n_jobs=n_jobs, + random_seed=random_seed, + **kwargs + ) diff --git a/evalml/pipelines/components/estimators/__init__.py b/evalml/pipelines/components/estimators/__init__.py index 737fce4435..f7950da6e7 100644 --- a/evalml/pipelines/components/estimators/__init__.py +++ b/evalml/pipelines/components/estimators/__init__.py @@ -1,24 +1,28 @@ from .estimator import Estimator -from .classifiers import (LogisticRegressionClassifier, - RandomForestClassifier, - XGBoostClassifier, - LightGBMClassifier, - CatBoostClassifier, - ElasticNetClassifier, - ExtraTreesClassifier, - BaselineClassifier, - DecisionTreeClassifier, - KNeighborsClassifier, - SVMClassifier) -from .regressors import (LinearRegressor, - LightGBMRegressor, - RandomForestRegressor, - CatBoostRegressor, - XGBoostRegressor, - ElasticNetRegressor, - ExtraTreesRegressor, - BaselineRegressor, - TimeSeriesBaselineEstimator, - DecisionTreeRegressor, - SVMRegressor, - ARIMARegressor) +from .classifiers import ( + LogisticRegressionClassifier, + RandomForestClassifier, + XGBoostClassifier, + LightGBMClassifier, + CatBoostClassifier, + ElasticNetClassifier, + ExtraTreesClassifier, + BaselineClassifier, + DecisionTreeClassifier, + KNeighborsClassifier, + SVMClassifier, +) +from .regressors import ( + LinearRegressor, + LightGBMRegressor, + RandomForestRegressor, + CatBoostRegressor, + XGBoostRegressor, + ElasticNetRegressor, + ExtraTreesRegressor, + BaselineRegressor, + TimeSeriesBaselineEstimator, + DecisionTreeRegressor, + SVMRegressor, + ARIMARegressor, +) diff --git 
a/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py b/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py index 80245df7cc..fb2b932a5b 100644 --- a/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/baseline_classifier.py @@ -1,4 +1,3 @@ - import numpy as np import pandas as pd @@ -13,6 +12,7 @@ class BaselineClassifier(Estimator): This is useful as a simple baseline classifier to compare with other classifiers. """ + name = "Baseline Classifier" hyperparameter_ranges = {} model_family = ModelFamily.BASELINE @@ -26,7 +26,9 @@ def __init__(self, strategy="mode", random_seed=0, **kwargs): random_seed (int): Seed for the random number generator. Defaults to 0. """ if strategy not in ["mode", "random", "random_weighted"]: - raise ValueError("'strategy' parameter must equal either 'mode', 'random', or 'random_weighted'") + raise ValueError( + "'strategy' parameter must equal either 'mode', 'random', or 'random_weighted'" + ) parameters = {"strategy": strategy} parameters.update(kwargs) self._classes = None @@ -34,9 +36,9 @@ def __init__(self, strategy="mode", random_seed=0, **kwargs): self._num_features = None self._num_unique = None self._mode = None - super().__init__(parameters=parameters, - component_obj=None, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=None, random_seed=random_seed + ) def fit(self, X, y=None): if y is None: @@ -60,9 +62,13 @@ def predict(self, X): if strategy == "mode": predictions = pd.Series([self._mode] * len(X)) elif strategy == "random": - predictions = get_random_state(self.random_seed).choice(self._classes, len(X)) + predictions = get_random_state(self.random_seed).choice( + self._classes, len(X) + ) else: - predictions = get_random_state(self.random_seed).choice(self._classes, len(X), p=self._percentage_freq) + predictions = get_random_state(self.random_seed).choice( + self._classes, len(X), p=self._percentage_freq + ) return infer_feature_types(predictions) def predict_proba(self, X): @@ -70,11 +76,18 @@ def predict_proba(self, X): strategy = self.parameters["strategy"] if strategy == "mode": mode_index = self._classes.index(self._mode) - proba_arr = np.array([[1.0 if i == mode_index else 0.0 for i in range(self._num_unique)]] * len(X)) + proba_arr = np.array( + [[1.0 if i == mode_index else 0.0 for i in range(self._num_unique)]] + * len(X) + ) elif strategy == "random": - proba_arr = np.array([[1.0 / self._num_unique for i in range(self._num_unique)]] * len(X)) + proba_arr = np.array( + [[1.0 / self._num_unique for i in range(self._num_unique)]] * len(X) + ) else: - proba_arr = np.array([[self._percentage_freq[i] for i in range(self._num_unique)]] * len(X)) + proba_arr = np.array( + [[self._percentage_freq[i] for i in range(self._num_unique)]] * len(X) + ) predictions = pd.DataFrame(proba_arr, columns=self._classes) return infer_feature_types(predictions) diff --git a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py index 41ebc1c906..d4c6a7cca1 100644 --- a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py @@ -18,6 +18,7 @@ class CatBoostClassifier(Estimator): For more information, check out https://catboost.ai/ """ + name = "CatBoost Classifier" hyperparameter_ranges = { "n_estimators": 
Integer(4, 100), @@ -25,35 +26,53 @@ class CatBoostClassifier(Estimator): "max_depth": Integer(4, 10), } model_family = ModelFamily.CATBOOST - supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS] + supported_problem_types = [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ] - def __init__(self, n_estimators=10, eta=0.03, max_depth=6, bootstrap_type=None, silent=True, - allow_writing_files=False, random_seed=0, **kwargs): - parameters = {"n_estimators": n_estimators, - "eta": eta, - "max_depth": max_depth, - 'bootstrap_type': bootstrap_type, - 'silent': silent, - 'allow_writing_files': allow_writing_files} + def __init__( + self, + n_estimators=10, + eta=0.03, + max_depth=6, + bootstrap_type=None, + silent=True, + allow_writing_files=False, + random_seed=0, + **kwargs + ): + parameters = { + "n_estimators": n_estimators, + "eta": eta, + "max_depth": max_depth, + "bootstrap_type": bootstrap_type, + "silent": silent, + "allow_writing_files": allow_writing_files, + } parameters.update(kwargs) - cb_error_msg = "catboost is not installed. Please install using `pip install catboost.`" + cb_error_msg = ( + "catboost is not installed. Please install using `pip install catboost.`" + ) catboost = import_or_raise("catboost", error_msg=cb_error_msg) self._label_encoder = None # catboost will choose an intelligent default for bootstrap_type, so only set if provided cb_parameters = copy.copy(parameters) if bootstrap_type is None: - cb_parameters.pop('bootstrap_type') - cb_classifier = catboost.CatBoostClassifier(**cb_parameters, - random_seed=random_seed) - super().__init__(parameters=parameters, - component_obj=cb_classifier, - random_seed=random_seed) + cb_parameters.pop("bootstrap_type") + cb_classifier = catboost.CatBoostClassifier( + **cb_parameters, random_seed=random_seed + ) + super().__init__( + parameters=parameters, component_obj=cb_classifier, random_seed=random_seed + ) def fit(self, X, y=None): X = infer_feature_types(X) - cat_cols = list(X.ww.select('category').columns) + cat_cols = list(X.ww.select("category").columns) self.input_feature_names = list(X.columns) X, y = super()._manage_woodwork(X, y) # For binary classification, catboost expects numeric values, so encoding before. 
@@ -69,7 +88,9 @@ def predict(self, X): if predictions.ndim == 2 and predictions.shape[1] == 1: predictions = predictions.flatten() if self._label_encoder: - predictions = self._label_encoder.inverse_transform(predictions.astype(np.int64)) + predictions = self._label_encoder.inverse_transform( + predictions.astype(np.int64) + ) return infer_feature_types(predictions) @property diff --git a/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py b/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py index 466e77f1d2..d663a2b8bb 100644 --- a/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/decision_tree_classifier.py @@ -8,32 +8,40 @@ class DecisionTreeClassifier(Estimator): """Decision Tree Classifier.""" + name = "Decision Tree Classifier" hyperparameter_ranges = { "criterion": ["gini", "entropy"], "max_features": ["auto", "sqrt", "log2"], - "max_depth": Integer(4, 10) + "max_depth": Integer(4, 10), } model_family = ModelFamily.DECISION_TREE - supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS] + supported_problem_types = [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ] - def __init__(self, - criterion="gini", - max_features="auto", - max_depth=6, - min_samples_split=2, - min_weight_fraction_leaf=0.0, - random_seed=0, - **kwargs): - parameters = {"criterion": criterion, - "max_features": max_features, - "max_depth": max_depth, - "min_samples_split": min_samples_split, - "min_weight_fraction_leaf": min_weight_fraction_leaf} + def __init__( + self, + criterion="gini", + max_features="auto", + max_depth=6, + min_samples_split=2, + min_weight_fraction_leaf=0.0, + random_seed=0, + **kwargs + ): + parameters = { + "criterion": criterion, + "max_features": max_features, + "max_depth": max_depth, + "min_samples_split": min_samples_split, + "min_weight_fraction_leaf": min_weight_fraction_leaf, + } parameters.update(kwargs) - dt_classifier = SKDecisionTreeClassifier(random_state=random_seed, - **parameters) - super().__init__(parameters=parameters, - component_obj=dt_classifier, - random_seed=random_seed) + dt_classifier = SKDecisionTreeClassifier(random_state=random_seed, **parameters) + super().__init__( + parameters=parameters, component_obj=dt_classifier, random_seed=random_seed + ) diff --git a/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py b/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py index 8e51126660..6df1cbd6de 100644 --- a/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/elasticnet_classifier.py @@ -11,33 +11,48 @@ class ElasticNetClassifier(Estimator): """Elastic Net Classifier.""" + name = "Elastic Net Classifier" hyperparameter_ranges = { "alpha": Real(0, 1), "l1_ratio": Real(0, 1), } model_family = ModelFamily.LINEAR_MODEL - supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS] + supported_problem_types = [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ] - def __init__(self, alpha=0.0001, l1_ratio=0.15, n_jobs=-1, max_iter=1000, - random_seed=0, penalty='elasticnet', - 
**kwargs): - parameters = {'alpha': alpha, - 'l1_ratio': l1_ratio, - 'n_jobs': n_jobs, - 'max_iter': max_iter, - 'penalty': penalty} - if kwargs.get('loss', 'log') != 'log': - warnings.warn("Parameter loss is being set to 'log' so that ElasticNetClassifier can predict probabilities" - f". Originally received '{kwargs['loss']}'.") + def __init__( + self, + alpha=0.0001, + l1_ratio=0.15, + n_jobs=-1, + max_iter=1000, + random_seed=0, + penalty="elasticnet", + **kwargs, + ): + parameters = { + "alpha": alpha, + "l1_ratio": l1_ratio, + "n_jobs": n_jobs, + "max_iter": max_iter, + "penalty": penalty, + } + if kwargs.get("loss", "log") != "log": + warnings.warn( + "Parameter loss is being set to 'log' so that ElasticNetClassifier can predict probabilities" + f". Originally received '{kwargs['loss']}'." + ) kwargs["loss"] = "log" parameters.update(kwargs) - en_classifier = SKElasticNetClassifier(random_state=random_seed, - **parameters) - super().__init__(parameters=parameters, - component_obj=en_classifier, - random_seed=random_seed) + en_classifier = SKElasticNetClassifier(random_state=random_seed, **parameters) + super().__init__( + parameters=parameters, component_obj=en_classifier, random_seed=random_seed + ) @property def feature_importance(self): diff --git a/evalml/pipelines/components/estimators/classifiers/et_classifier.py b/evalml/pipelines/components/estimators/classifiers/et_classifier.py index 06fb0e8993..1273878694 100644 --- a/evalml/pipelines/components/estimators/classifiers/et_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/et_classifier.py @@ -8,34 +8,42 @@ class ExtraTreesClassifier(Estimator): """Extra Trees Classifier.""" + name = "Extra Trees Classifier" hyperparameter_ranges = { "n_estimators": Integer(10, 1000), "max_features": ["auto", "sqrt", "log2"], - "max_depth": Integer(4, 10) + "max_depth": Integer(4, 10), } model_family = ModelFamily.EXTRA_TREES - supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS] + supported_problem_types = [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ] - def __init__(self, - n_estimators=100, - max_features="auto", - max_depth=6, - min_samples_split=2, - min_weight_fraction_leaf=0.0, - n_jobs=-1, - random_seed=0, - **kwargs): - parameters = {"n_estimators": n_estimators, - "max_features": max_features, - "max_depth": max_depth, - "min_samples_split": min_samples_split, - "min_weight_fraction_leaf": min_weight_fraction_leaf, - "n_jobs": n_jobs} + def __init__( + self, + n_estimators=100, + max_features="auto", + max_depth=6, + min_samples_split=2, + min_weight_fraction_leaf=0.0, + n_jobs=-1, + random_seed=0, + **kwargs + ): + parameters = { + "n_estimators": n_estimators, + "max_features": max_features, + "max_depth": max_depth, + "min_samples_split": min_samples_split, + "min_weight_fraction_leaf": min_weight_fraction_leaf, + "n_jobs": n_jobs, + } parameters.update(kwargs) - et_classifier = SKExtraTreesClassifier(random_state=random_seed, - **parameters) - super().__init__(parameters=parameters, - component_obj=et_classifier, - random_seed=random_seed) + et_classifier = SKExtraTreesClassifier(random_state=random_seed, **parameters) + super().__init__( + parameters=parameters, component_obj=et_classifier, random_seed=random_seed + ) diff --git a/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py 
b/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py index 69a3395ca2..cbe6da5341 100644 --- a/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/kneighbors_classifier.py @@ -11,36 +11,45 @@ class KNeighborsClassifier(Estimator): """ K-Nearest Neighbors Classifier. """ + name = "KNN Classifier" hyperparameter_ranges = { "n_neighbors": Integer(2, 12), "weights": ["uniform", "distance"], "algorithm": ["auto", "ball_tree", "kd_tree", "brute"], "leaf_size": Integer(10, 30), - "p": Integer(1, 5) + "p": Integer(1, 5), } model_family = ModelFamily.K_NEIGHBORS - supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS] + supported_problem_types = [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ] - def __init__(self, - n_neighbors=5, - weights="uniform", - algorithm="auto", - leaf_size=30, - p=2, - random_seed=0, - **kwargs): - parameters = {"n_neighbors": n_neighbors, - "weights": weights, - "algorithm": algorithm, - "leaf_size": leaf_size, - "p": p} + def __init__( + self, + n_neighbors=5, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + random_seed=0, + **kwargs + ): + parameters = { + "n_neighbors": n_neighbors, + "weights": weights, + "algorithm": algorithm, + "leaf_size": leaf_size, + "p": p, + } parameters.update(kwargs) knn_classifier = SKKNeighborsClassifier(**parameters) - super().__init__(parameters=parameters, - component_obj=knn_classifier, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=knn_classifier, random_seed=random_seed + ) @property def feature_importance(self): diff --git a/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py b/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py index 49fb842f54..86a94c730a 100644 --- a/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py @@ -13,12 +13,13 @@ SEED_BOUNDS, _rename_column_names_to_numeric, import_or_raise, - infer_feature_types + infer_feature_types, ) class LightGBMClassifier(Estimator): """LightGBM Classifier""" + name = "LightGBM Classifier" hyperparameter_ranges = { "learning_rate": Real(0.000001, 1), @@ -28,54 +29,80 @@ class LightGBMClassifier(Estimator): "num_leaves": Integer(2, 100), "min_child_samples": Integer(1, 100), "bagging_fraction": Real(0.000001, 1), - "bagging_freq": Integer(0, 1) + "bagging_freq": Integer(0, 1), } model_family = ModelFamily.LIGHTGBM - supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS] + supported_problem_types = [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ] SEED_MIN = 0 SEED_MAX = SEED_BOUNDS.max_bound - def __init__(self, boosting_type="gbdt", learning_rate=0.1, n_estimators=100, max_depth=0, num_leaves=31, - min_child_samples=20, n_jobs=-1, random_seed=0, - bagging_fraction=0.9, bagging_freq=0, **kwargs): - parameters = {"boosting_type": boosting_type, - "learning_rate": learning_rate, - "n_estimators": n_estimators, - "max_depth": max_depth, - "num_leaves": num_leaves, - "min_child_samples": min_child_samples, - "n_jobs": n_jobs, - "bagging_freq": 
bagging_freq, - "bagging_fraction": bagging_fraction} + def __init__( + self, + boosting_type="gbdt", + learning_rate=0.1, + n_estimators=100, + max_depth=0, + num_leaves=31, + min_child_samples=20, + n_jobs=-1, + random_seed=0, + bagging_fraction=0.9, + bagging_freq=0, + **kwargs + ): + parameters = { + "boosting_type": boosting_type, + "learning_rate": learning_rate, + "n_estimators": n_estimators, + "max_depth": max_depth, + "num_leaves": num_leaves, + "min_child_samples": min_child_samples, + "n_jobs": n_jobs, + "bagging_freq": bagging_freq, + "bagging_fraction": bagging_fraction, + } parameters.update(kwargs) lg_parameters = copy.copy(parameters) # when boosting type is random forest (rf), LightGBM requires bagging_freq == 1 and 0 < bagging_fraction < 1.0 if boosting_type == "rf": - lg_parameters['bagging_freq'] = 1 + lg_parameters["bagging_freq"] = 1 # when boosting type is goss, LightGBM requires bagging_fraction == 1 elif boosting_type == "goss": - lg_parameters['bagging_fraction'] = 1 + lg_parameters["bagging_fraction"] = 1 # avoid lightgbm warnings having to do with parameter aliases - if lg_parameters['bagging_freq'] is not None or lg_parameters['bagging_fraction'] is not None: - lg_parameters.update({'subsample': None, 'subsample_freq': None}) + if ( + lg_parameters["bagging_freq"] is not None + or lg_parameters["bagging_fraction"] is not None + ): + lg_parameters.update({"subsample": None, "subsample_freq": None}) - lgbm_error_msg = "LightGBM is not installed. Please install using `pip install lightgbm`." + lgbm_error_msg = ( + "LightGBM is not installed. Please install using `pip install lightgbm`." + ) lgbm = import_or_raise("lightgbm", error_msg=lgbm_error_msg) self._ordinal_encoder = None self._label_encoder = None - lgbm_classifier = lgbm.sklearn.LGBMClassifier(random_state=random_seed, **lg_parameters) + lgbm_classifier = lgbm.sklearn.LGBMClassifier( + random_state=random_seed, **lg_parameters + ) - super().__init__(parameters=parameters, - component_obj=lgbm_classifier, - random_seed=random_seed) + super().__init__( + parameters=parameters, + component_obj=lgbm_classifier, + random_seed=random_seed, + ) def _encode_categories(self, X, fit=False): """Encodes each categorical feature using ordinal encoding.""" X = infer_feature_types(X) - cat_cols = X.ww.select('category').columns + cat_cols = X.ww.select("category").columns if fit: self.input_feature_names = list(X.columns) X_encoded = _rename_column_names_to_numeric(X) @@ -90,7 +117,7 @@ def _encode_categories(self, X, fit=False): else: encoder_output = self._ordinal_encoder.transform(X_encoded[cat_cols]) X_encoded[cat_cols] = pd.DataFrame(encoder_output) - X_encoded[cat_cols] = X_encoded[cat_cols].astype('category') + X_encoded[cat_cols] = X_encoded[cat_cols].astype("category") return X_encoded def _encode_labels(self, y): @@ -98,7 +125,9 @@ def _encode_labels(self, y): # change only if dtype isn't int if not is_integer_dtype(y_encoded): self._label_encoder = LabelEncoder() - y_encoded = pd.Series(self._label_encoder.fit_transform(y_encoded), dtype='int64') + y_encoded = pd.Series( + self._label_encoder.fit_transform(y_encoded), dtype="int64" + ) return y_encoded def fit(self, X, y=None): @@ -113,7 +142,9 @@ def predict(self, X): predictions = super().predict(X_encoded) if not self._label_encoder: return predictions - predictions = pd.Series(self._label_encoder.inverse_transform(predictions.astype(np.int64))) + predictions = pd.Series( + self._label_encoder.inverse_transform(predictions.astype(np.int64)) + ) return 
infer_feature_types(predictions) def predict_proba(self, X): diff --git a/evalml/pipelines/components/estimators/classifiers/logistic_regression.py b/evalml/pipelines/components/estimators/classifiers/logistic_regression.py index 8eaaf292dc..5d9cfd2ab1 100644 --- a/evalml/pipelines/components/estimators/classifiers/logistic_regression.py +++ b/evalml/pipelines/components/estimators/classifiers/logistic_regression.py @@ -11,27 +11,42 @@ class LogisticRegressionClassifier(Estimator): """ Logistic Regression Classifier. """ + name = "Logistic Regression Classifier" hyperparameter_ranges = { "penalty": ["l2"], - "C": Real(.01, 10), + "C": Real(0.01, 10), } model_family = ModelFamily.LINEAR_MODEL - supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS] + supported_problem_types = [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ] - def __init__(self, penalty="l2", C=1.0, n_jobs=-1, multi_class="auto", solver="lbfgs", random_seed=0, **kwargs): - parameters = {"penalty": penalty, - "C": C, - "n_jobs": n_jobs, - "multi_class": multi_class, - "solver": solver} + def __init__( + self, + penalty="l2", + C=1.0, + n_jobs=-1, + multi_class="auto", + solver="lbfgs", + random_seed=0, + **kwargs + ): + parameters = { + "penalty": penalty, + "C": C, + "n_jobs": n_jobs, + "multi_class": multi_class, + "solver": solver, + } parameters.update(kwargs) - lr_classifier = LogisticRegression(random_state=random_seed, - **parameters) - super().__init__(parameters=parameters, - component_obj=lr_classifier, - random_seed=random_seed) + lr_classifier = LogisticRegression(random_state=random_seed, **parameters) + super().__init__( + parameters=parameters, component_obj=lr_classifier, random_seed=random_seed + ) @property def feature_importance(self): diff --git a/evalml/pipelines/components/estimators/classifiers/rf_classifier.py b/evalml/pipelines/components/estimators/classifiers/rf_classifier.py index 83d767c960..3660cd8462 100644 --- a/evalml/pipelines/components/estimators/classifiers/rf_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/rf_classifier.py @@ -8,22 +8,30 @@ class RandomForestClassifier(Estimator): """Random Forest Classifier.""" + name = "Random Forest Classifier" hyperparameter_ranges = { "n_estimators": Integer(10, 1000), "max_depth": Integer(1, 10), } model_family = ModelFamily.RANDOM_FOREST - supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS] + supported_problem_types = [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ] - def __init__(self, n_estimators=100, max_depth=6, n_jobs=-1, random_seed=0, **kwargs): - parameters = {"n_estimators": n_estimators, - "max_depth": max_depth, - "n_jobs": n_jobs} + def __init__( + self, n_estimators=100, max_depth=6, n_jobs=-1, random_seed=0, **kwargs + ): + parameters = { + "n_estimators": n_estimators, + "max_depth": max_depth, + "n_jobs": n_jobs, + } parameters.update(kwargs) - rf_classifier = SKRandomForestClassifier(random_state=random_seed, - **parameters) - super().__init__(parameters=parameters, - component_obj=rf_classifier, - random_seed=random_seed) + rf_classifier = SKRandomForestClassifier(random_state=random_seed, **parameters) + super().__init__( + parameters=parameters, 
component_obj=rf_classifier, random_seed=random_seed + ) diff --git a/evalml/pipelines/components/estimators/classifiers/svm_classifier.py b/evalml/pipelines/components/estimators/classifiers/svm_classifier.py index 34b52e685a..576875aeda 100644 --- a/evalml/pipelines/components/estimators/classifiers/svm_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/svm_classifier.py @@ -9,40 +9,48 @@ class SVMClassifier(Estimator): """Support Vector Machine Classifier.""" + name = "SVM Classifier" hyperparameter_ranges = { "C": Real(0, 10), "kernel": ["linear", "poly", "rbf", "sigmoid", "precomputed"], - "gamma": ["scale", "auto"] + "gamma": ["scale", "auto"], } model_family = ModelFamily.SVM - supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS] + supported_problem_types = [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ] - def __init__(self, - C=1.0, - kernel="rbf", - gamma="scale", - probability=True, - random_seed=0, - **kwargs): - parameters = {"C": C, - "kernel": kernel, - "gamma": gamma, - "probability": probability} + def __init__( + self, + C=1.0, + kernel="rbf", + gamma="scale", + probability=True, + random_seed=0, + **kwargs + ): + parameters = { + "C": C, + "kernel": kernel, + "gamma": gamma, + "probability": probability, + } parameters.update(kwargs) - svm_classifier = SVC(random_state=random_seed, - **parameters) - super().__init__(parameters=parameters, - component_obj=svm_classifier, - random_seed=random_seed) + svm_classifier = SVC(random_state=random_seed, **parameters) + super().__init__( + parameters=parameters, component_obj=svm_classifier, random_seed=random_seed + ) @property def feature_importance(self): """Feature importance only works with linear kernels. If the kernel isn't linear, we return a numpy array of zeros """ - if self._parameters['kernel'] != 'linear': + if self._parameters["kernel"] != "linear": return np.zeros(self._component_obj.n_features_in_) else: return self._component_obj.coef_ diff --git a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py index 1671752dc5..8be7d660b6 100644 --- a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py @@ -3,14 +3,12 @@ from evalml.model_family import ModelFamily from evalml.pipelines.components.estimators import Estimator from evalml.problem_types import ProblemTypes -from evalml.utils.gen_utils import ( - _rename_column_names_to_numeric, - import_or_raise -) +from evalml.utils.gen_utils import _rename_column_names_to_numeric, import_or_raise class XGBoostClassifier(Estimator): """XGBoost Classifier.""" + name = "XGBoost Classifier" hyperparameter_ranges = { "eta": Real(0.000001, 1), @@ -19,28 +17,43 @@ class XGBoostClassifier(Estimator): "n_estimators": Integer(1, 1000), } model_family = ModelFamily.XGBOOST - supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS] + supported_problem_types = [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ] # xgboost supports seeds from -2**31 to 2**31 - 1 inclusive. these limits ensure the random seed generated below # is within that range. 
- SEED_MIN = -2**31 - SEED_MAX = 2**31 - 1 - - def __init__(self, eta=0.1, max_depth=6, min_child_weight=1, n_estimators=100, random_seed=0, **kwargs): - parameters = {"eta": eta, - "max_depth": max_depth, - "min_child_weight": min_child_weight, - "n_estimators": n_estimators} + SEED_MIN = -(2 ** 31) + SEED_MAX = 2 ** 31 - 1 + + def __init__( + self, + eta=0.1, + max_depth=6, + min_child_weight=1, + n_estimators=100, + random_seed=0, + **kwargs + ): + parameters = { + "eta": eta, + "max_depth": max_depth, + "min_child_weight": min_child_weight, + "n_estimators": n_estimators, + } parameters.update(kwargs) - xgb_error_msg = "XGBoost is not installed. Please install using `pip install xgboost.`" + xgb_error_msg = ( + "XGBoost is not installed. Please install using `pip install xgboost.`" + ) xgb = import_or_raise("xgboost", error_msg=xgb_error_msg) - xgb_classifier = xgb.XGBClassifier(random_state=random_seed, - **parameters) + xgb_classifier = xgb.XGBClassifier(random_state=random_seed, **parameters) - super().__init__(parameters=parameters, - component_obj=xgb_classifier, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=xgb_classifier, random_seed=random_seed + ) def fit(self, X, y=None): X, y = super()._manage_woodwork(X, y) diff --git a/evalml/pipelines/components/estimators/estimator.py b/evalml/pipelines/components/estimators/estimator.py index bc81706d8c..c2bcb9b67c 100644 --- a/evalml/pipelines/components/estimators/estimator.py +++ b/evalml/pipelines/components/estimators/estimator.py @@ -17,6 +17,7 @@ class Estimator(ComponentBase): To see some examples, check out the definitions of any Estimator component. """ + # We can't use the inspect module to dynamically determine this because of issue 1582 predict_uses_y = False model_family = ModelFamily.NONE @@ -29,7 +30,12 @@ def supported_problem_types(cls): def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs): self.input_feature_names = None - super().__init__(parameters=parameters, component_obj=component_obj, random_seed=random_seed, **kwargs) + super().__init__( + parameters=parameters, + component_obj=component_obj, + random_seed=random_seed, + **kwargs + ) def _manage_woodwork(self, X, y=None): """Function to convert the input and target data to Pandas data structures.""" @@ -58,7 +64,9 @@ def predict(self, X): X = infer_feature_types(X) predictions = self._component_obj.predict(X) except AttributeError: - raise MethodPropertyNotFoundError("Estimator requires a predict method or a component_obj that implements predict") + raise MethodPropertyNotFoundError( + "Estimator requires a predict method or a component_obj that implements predict" + ) return infer_feature_types(predictions) def predict_proba(self, X): @@ -74,7 +82,9 @@ def predict_proba(self, X): X = infer_feature_types(X) pred_proba = self._component_obj.predict_proba(X) except AttributeError: - raise MethodPropertyNotFoundError("Estimator requires a predict_proba method or a component_obj that implements predict_proba") + raise MethodPropertyNotFoundError( + "Estimator requires a predict_proba method or a component_obj that implements predict_proba" + ) return infer_feature_types(pred_proba) @property @@ -87,7 +97,12 @@ def feature_importance(self): try: return self._component_obj.feature_importances_ except AttributeError: - raise MethodPropertyNotFoundError("Estimator requires a feature_importance property or a component_obj that implements feature_importances_") + raise MethodPropertyNotFoundError( + 
"Estimator requires a feature_importance property or a component_obj that implements feature_importances_" + ) def __eq__(self, other): - return super().__eq__(other) and self.supported_problem_types == other.supported_problem_types + return ( + super().__eq__(other) + and self.supported_problem_types == other.supported_problem_types + ) diff --git a/evalml/pipelines/components/estimators/regressors/arima_regressor.py b/evalml/pipelines/components/estimators/regressors/arima_regressor.py index c9add0a744..97e879dce0 100644 --- a/evalml/pipelines/components/estimators/regressors/arima_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/arima_regressor.py @@ -17,6 +17,7 @@ class ARIMARegressor(Estimator): Currently ARIMARegressor isn't supported via conda install. It's recommended that it be installed via PyPI. """ + name = "ARIMA Regressor" hyperparameter_ranges = { "start_p": Integer(1, 3), @@ -25,13 +26,26 @@ class ARIMARegressor(Estimator): "max_p": Integer(3, 10), "max_d": Integer(2, 5), "max_q": Integer(3, 10), - "seasonal": [True, False] + "seasonal": [True, False], } model_family = ModelFamily.ARIMA supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION] - def __init__(self, date_index=None, trend=None, start_p=2, d=0, start_q=2, max_p=5, max_d=2, max_q=5, seasonal=True, - n_jobs=-1, random_seed=0, **kwargs): + def __init__( + self, + date_index=None, + trend=None, + start_p=2, + d=0, + start_q=2, + max_p=5, + max_d=2, + max_q=5, + seasonal=True, + n_jobs=-1, + random_seed=0, + **kwargs, + ): """ Arguments: date_index (str): Specifies the name of the column in X that provides the datetime objects. Defaults to None. @@ -47,42 +61,54 @@ def __init__(self, date_index=None, trend=None, start_p=2, d=0, start_q=2, max_p seasonal (bool): Whether to fit a seasonal model to ARIMA. """ - parameters = {'trend': trend, - 'start_p': start_p, - 'd': d, - 'start_q': start_q, - 'max_p': max_p, - 'max_d': max_d, - 'max_q': max_q, - 'seasonal': seasonal, - "n_jobs": n_jobs, - "date_index": date_index} + parameters = { + "trend": trend, + "start_p": start_p, + "d": d, + "start_q": start_q, + "max_p": max_p, + "max_d": max_d, + "max_q": max_q, + "seasonal": seasonal, + "n_jobs": n_jobs, + "date_index": date_index, + } parameters.update(kwargs) - arima_model_msg = "sktime is not installed. Please install using `pip install sktime.`" - sktime_arima = import_or_raise("sktime.forecasting.arima", error_msg=arima_model_msg) + arima_model_msg = ( + "sktime is not installed. 
Please install using `pip install sktime.`" + ) + sktime_arima = import_or_raise( + "sktime.forecasting.arima", error_msg=arima_model_msg + ) arima_model = sktime_arima.AutoARIMA(**parameters) - super().__init__(parameters=parameters, - component_obj=arima_model, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=arima_model, random_seed=random_seed + ) def _get_dates(self, X, y): date_col = None if y is not None: - y_index_type = infer_feature_types(pd.Series(y.index)).ww.logical_type.type_string - if y_index_type == 'datetime': + y_index_type = infer_feature_types( + pd.Series(y.index) + ).ww.logical_type.type_string + if y_index_type == "datetime": date_col = y.index if X is not None: - X_index_type = infer_feature_types(pd.Series(X.index)).ww.logical_type.type_string - if self.parameters['date_index'] in X.columns: - date_col = X.pop(self.parameters['date_index']) - elif X_index_type == 'datetime': + X_index_type = infer_feature_types( + pd.Series(X.index) + ).ww.logical_type.type_string + if self.parameters["date_index"] in X.columns: + date_col = X.pop(self.parameters["date_index"]) + elif X_index_type == "datetime": date_col = X.index if date_col is None: - msg = "ARIMA regressor requires input data X to have a datetime column specified by the 'date_index' parameter. " \ - "If not it will look for the datetime column in the index of X or y." + msg = ( + "ARIMA regressor requires input data X to have a datetime column specified by the 'date_index' parameter. " + "If not it will look for the datetime column in the index of X or y." + ) raise ValueError(msg) return date_col, X @@ -102,14 +128,20 @@ def _format_dates(self, dates, X, y, predict=False): dates.set_index(dates.columns[0], drop=True, inplace=True) dates = pd.DatetimeIndex(dates.index) elif dates.shape[1] > 1: - raise ValueError(f"The dates parameter should not consist of any additional data outside of the datetime information located in the index or in a column." - f" Found {dates.shape[1]} columns.") - freq = 'M' if pd.infer_freq(dates) == 'MS' else pd.infer_freq(dates) + raise ValueError( + f"The dates parameter should not consist of any additional data outside of the datetime information located in the index or in a column." + f" Found {dates.shape[1]} columns." + ) + freq = "M" if pd.infer_freq(dates) == "MS" else pd.infer_freq(dates) dates = dates.to_period(freq=freq) X, y = self._match_indices(X, y, dates) if predict: - arima_model_msg = "sktime is not installed. Please install using `pip install sktime.`" - forecasting_ = import_or_raise("sktime.forecasting.base", error_msg=arima_model_msg) + arima_model_msg = ( + "sktime is not installed. 
Please install using `pip install sktime.`" + ) + forecasting_ = import_or_raise( + "sktime.forecasting.base", error_msg=arima_model_msg + ) fh_ = forecasting_.ForecastingHorizon(dates, is_relative=False) return X, y, fh_ else: @@ -117,13 +149,13 @@ def _format_dates(self, dates, X, y, predict=False): def fit(self, X, y=None): if y is None: - raise ValueError('ARIMA Regressor requires y as input.') + raise ValueError("ARIMA Regressor requires y as input.") X, y = self._manage_woodwork(X, y) dates, X = self._get_dates(X, y) X, y, _ = self._format_dates(dates, X, y) if X is not None and not X.empty: - X = X.select_dtypes(exclude=['datetime64']) + X = X.select_dtypes(exclude=["datetime64"]) self._component_obj.fit(y=y, X=X) else: self._component_obj.fit(y=y) @@ -134,7 +166,7 @@ def predict(self, X, y=None): dates, X = self._get_dates(X, y) X, y, fh_ = self._format_dates(dates, X, y, predict=True) if X is not None and not X.empty: - X = X.select_dtypes(exclude=['datetime64']) + X = X.select_dtypes(exclude=["datetime64"]) y_pred = self._component_obj.predict(fh=fh_, X=X) else: try: @@ -142,8 +174,10 @@ def predict(self, X, y=None): except ValueError as ve: error = str(ve) if "When an ARIMA is fit with an X array" in error: - raise ValueError("If X was passed to the fit method of the ARIMARegressor, " - "then it must be passed to the predict method as well.") + raise ValueError( + "If X was passed to the fit method of the ARIMARegressor, " + "then it must be passed to the predict method as well." + ) else: raise ve return infer_feature_types(y_pred) diff --git a/evalml/pipelines/components/estimators/regressors/baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/baseline_regressor.py index f55187e69f..acef4fd8e6 100644 --- a/evalml/pipelines/components/estimators/regressors/baseline_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/baseline_regressor.py @@ -12,10 +12,14 @@ class BaselineRegressor(Estimator): This is useful as a simple baseline regressor to compare with other regressors. """ + name = "Baseline Regressor" hyperparameter_ranges = {} model_family = ModelFamily.BASELINE - supported_problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION] + supported_problem_types = [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ] def __init__(self, strategy="mean", random_seed=0, **kwargs): """Baseline regressor that uses a simple strategy to make predictions. @@ -25,15 +29,17 @@ def __init__(self, strategy="mean", random_seed=0, **kwargs): random_seed (int): Seed for the random number generator. Defaults to 0. 
""" if strategy not in ["mean", "median"]: - raise ValueError("'strategy' parameter must equal either 'mean' or 'median'") + raise ValueError( + "'strategy' parameter must equal either 'mean' or 'median'" + ) parameters = {"strategy": strategy} parameters.update(kwargs) self._prediction_value = None self._num_features = None - super().__init__(parameters=parameters, - component_obj=None, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=None, random_seed=random_seed + ) def fit(self, X, y=None): if y is None: diff --git a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py index 89469fbe8b..3374391e46 100644 --- a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py @@ -15,6 +15,7 @@ class CatBoostRegressor(Estimator): For more information, check out https://catboost.ai/ """ + name = "CatBoost Regressor" hyperparameter_ranges = { "n_estimators": Integer(4, 100), @@ -22,33 +23,50 @@ class CatBoostRegressor(Estimator): "max_depth": Integer(4, 10), } model_family = ModelFamily.CATBOOST - supported_problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION] - - def __init__(self, n_estimators=10, eta=0.03, max_depth=6, bootstrap_type=None, silent=False, - allow_writing_files=False, random_seed=0, **kwargs): - parameters = {"n_estimators": n_estimators, - "eta": eta, - "max_depth": max_depth, - 'bootstrap_type': bootstrap_type, - 'silent': silent, - 'allow_writing_files': allow_writing_files} + supported_problem_types = [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ] + + def __init__( + self, + n_estimators=10, + eta=0.03, + max_depth=6, + bootstrap_type=None, + silent=False, + allow_writing_files=False, + random_seed=0, + **kwargs + ): + parameters = { + "n_estimators": n_estimators, + "eta": eta, + "max_depth": max_depth, + "bootstrap_type": bootstrap_type, + "silent": silent, + "allow_writing_files": allow_writing_files, + } parameters.update(kwargs) - cb_error_msg = "catboost is not installed. Please install using `pip install catboost.`" + cb_error_msg = ( + "catboost is not installed. 
Please install using `pip install catboost.`" + ) catboost = import_or_raise("catboost", error_msg=cb_error_msg) # catboost will choose an intelligent default for bootstrap_type, so only set if provided cb_parameters = copy.copy(parameters) if bootstrap_type is None: - cb_parameters.pop('bootstrap_type') - cb_regressor = catboost.CatBoostRegressor(**cb_parameters, - random_seed=random_seed) - super().__init__(parameters=parameters, - component_obj=cb_regressor, - random_seed=random_seed) + cb_parameters.pop("bootstrap_type") + cb_regressor = catboost.CatBoostRegressor( + **cb_parameters, random_seed=random_seed + ) + super().__init__( + parameters=parameters, component_obj=cb_regressor, random_seed=random_seed + ) def fit(self, X, y=None): X = infer_feature_types(X) - cat_cols = list(X.ww.select('category').columns) + cat_cols = list(X.ww.select("category").columns) self.input_feature_names = list(X.columns) X, y = super()._manage_woodwork(X, y) self._component_obj.fit(X, y, silent=True, cat_features=cat_cols) diff --git a/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py b/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py index e29eb4e22a..35ec90a617 100644 --- a/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py @@ -8,31 +8,38 @@ class DecisionTreeRegressor(Estimator): """Decision Tree Regressor.""" + name = "Decision Tree Regressor" hyperparameter_ranges = { "criterion": ["mse", "friedman_mse", "mae"], "max_features": ["auto", "sqrt", "log2"], - "max_depth": Integer(4, 10) + "max_depth": Integer(4, 10), } model_family = ModelFamily.DECISION_TREE - supported_problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION] + supported_problem_types = [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ] - def __init__(self, - criterion="mse", - max_features="auto", - max_depth=6, - min_samples_split=2, - min_weight_fraction_leaf=0.0, - random_seed=0, - **kwargs): - parameters = {"criterion": criterion, - "max_features": max_features, - "max_depth": max_depth, - "min_samples_split": min_samples_split, - "min_weight_fraction_leaf": min_weight_fraction_leaf} + def __init__( + self, + criterion="mse", + max_features="auto", + max_depth=6, + min_samples_split=2, + min_weight_fraction_leaf=0.0, + random_seed=0, + **kwargs + ): + parameters = { + "criterion": criterion, + "max_features": max_features, + "max_depth": max_depth, + "min_samples_split": min_samples_split, + "min_weight_fraction_leaf": min_weight_fraction_leaf, + } parameters.update(kwargs) - dt_regressor = SKDecisionTreeRegressor(random_state=random_seed, - **parameters) - super().__init__(parameters=parameters, - component_obj=dt_regressor, - random_seed=random_seed) + dt_regressor = SKDecisionTreeRegressor(random_state=random_seed, **parameters) + super().__init__( + parameters=parameters, component_obj=dt_regressor, random_seed=random_seed + ) diff --git a/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py b/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py index 791404c467..da1ed1eaad 100644 --- a/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py @@ -8,25 +8,38 @@ class ElasticNetRegressor(Estimator): """Elastic Net Regressor.""" + name = "Elastic Net Regressor" hyperparameter_ranges = { 
"alpha": Real(0, 1), "l1_ratio": Real(0, 1), } model_family = ModelFamily.LINEAR_MODEL - supported_problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION] + supported_problem_types = [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ] - def __init__(self, alpha=0.0001, l1_ratio=0.15, max_iter=1000, normalize=False, random_seed=0, **kwargs): - parameters = {'alpha': alpha, - 'l1_ratio': l1_ratio, - 'max_iter': max_iter, - 'normalize': normalize} + def __init__( + self, + alpha=0.0001, + l1_ratio=0.15, + max_iter=1000, + normalize=False, + random_seed=0, + **kwargs + ): + parameters = { + "alpha": alpha, + "l1_ratio": l1_ratio, + "max_iter": max_iter, + "normalize": normalize, + } parameters.update(kwargs) - en_regressor = SKElasticNet(random_state=random_seed, - **parameters) - super().__init__(parameters=parameters, - component_obj=en_regressor, - random_seed=random_seed) + en_regressor = SKElasticNet(random_state=random_seed, **parameters) + super().__init__( + parameters=parameters, component_obj=en_regressor, random_seed=random_seed + ) @property def feature_importance(self): diff --git a/evalml/pipelines/components/estimators/regressors/et_regressor.py b/evalml/pipelines/components/estimators/regressors/et_regressor.py index a57da4744a..c7a414afe5 100644 --- a/evalml/pipelines/components/estimators/regressors/et_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/et_regressor.py @@ -8,34 +8,41 @@ class ExtraTreesRegressor(Estimator): """Extra Trees Regressor.""" + name = "Extra Trees Regressor" hyperparameter_ranges = { "n_estimators": Integer(10, 1000), "max_features": ["auto", "sqrt", "log2"], - "max_depth": Integer(4, 10) + "max_depth": Integer(4, 10), } model_family = ModelFamily.EXTRA_TREES - supported_problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION] + supported_problem_types = [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ] - def __init__(self, - n_estimators=100, - max_features="auto", - max_depth=6, - min_samples_split=2, - min_weight_fraction_leaf=0.0, - n_jobs=-1, - random_seed=0, - **kwargs): - parameters = {"n_estimators": n_estimators, - "max_features": max_features, - "max_depth": max_depth, - "min_samples_split": min_samples_split, - "min_weight_fraction_leaf": min_weight_fraction_leaf, - "n_jobs": n_jobs} + def __init__( + self, + n_estimators=100, + max_features="auto", + max_depth=6, + min_samples_split=2, + min_weight_fraction_leaf=0.0, + n_jobs=-1, + random_seed=0, + **kwargs + ): + parameters = { + "n_estimators": n_estimators, + "max_features": max_features, + "max_depth": max_depth, + "min_samples_split": min_samples_split, + "min_weight_fraction_leaf": min_weight_fraction_leaf, + "n_jobs": n_jobs, + } parameters.update(kwargs) - et_regressor = SKExtraTreesRegressor(random_state=random_seed, - **parameters) - super().__init__(parameters=parameters, - component_obj=et_regressor, - random_seed=random_seed) + et_regressor = SKExtraTreesRegressor(random_state=random_seed, **parameters) + super().__init__( + parameters=parameters, component_obj=et_regressor, random_seed=random_seed + ) diff --git a/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py b/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py index 1fa473dd62..cd16c95ab3 100644 --- a/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py @@ -11,12 +11,13 @@ 
SEED_BOUNDS, _rename_column_names_to_numeric, import_or_raise, - infer_feature_types + infer_feature_types, ) class LightGBMRegressor(Estimator): """LightGBM Regressor""" + name = "LightGBM Regressor" hyperparameter_ranges = { "learning_rate": Real(0.000001, 1), @@ -26,7 +27,7 @@ class LightGBMRegressor(Estimator): "num_leaves": Integer(2, 100), "min_child_samples": Integer(1, 100), "bagging_fraction": Real(0.000001, 1), - "bagging_freq": Integer(0, 1) + "bagging_freq": Integer(0, 1), } model_family = ModelFamily.LIGHTGBM supported_problem_types = [ProblemTypes.REGRESSION] @@ -34,45 +35,65 @@ class LightGBMRegressor(Estimator): SEED_MIN = 0 SEED_MAX = SEED_BOUNDS.max_bound - def __init__(self, boosting_type="gbdt", learning_rate=0.1, n_estimators=20, max_depth=0, num_leaves=31, - min_child_samples=20, n_jobs=-1, random_seed=0, - bagging_fraction=0.9, bagging_freq=0, **kwargs): + def __init__( + self, + boosting_type="gbdt", + learning_rate=0.1, + n_estimators=20, + max_depth=0, + num_leaves=31, + min_child_samples=20, + n_jobs=-1, + random_seed=0, + bagging_fraction=0.9, + bagging_freq=0, + **kwargs + ): - parameters = {"boosting_type": boosting_type, - "learning_rate": learning_rate, - "n_estimators": n_estimators, - "max_depth": max_depth, - "num_leaves": num_leaves, - "min_child_samples": min_child_samples, - "n_jobs": n_jobs, - "bagging_freq": bagging_freq, - "bagging_fraction": bagging_fraction} + parameters = { + "boosting_type": boosting_type, + "learning_rate": learning_rate, + "n_estimators": n_estimators, + "max_depth": max_depth, + "num_leaves": num_leaves, + "min_child_samples": min_child_samples, + "n_jobs": n_jobs, + "bagging_freq": bagging_freq, + "bagging_fraction": bagging_fraction, + } parameters.update(kwargs) lg_parameters = copy.copy(parameters) # when boosting type is random forest (rf), LightGBM requires bagging_freq == 1 and 0 < bagging_fraction < 1.0 if boosting_type == "rf": - lg_parameters['bagging_freq'] = 1 + lg_parameters["bagging_freq"] = 1 # when boosting type is goss, LightGBM requires bagging_fraction == 1 elif boosting_type == "goss": - lg_parameters['bagging_fraction'] = 1 + lg_parameters["bagging_fraction"] = 1 # avoid lightgbm warnings having to do with parameter aliases - if lg_parameters['bagging_freq'] is not None or lg_parameters['bagging_fraction'] is not None: - lg_parameters.update({'subsample': None, 'subsample_freq': None}) + if ( + lg_parameters["bagging_freq"] is not None + or lg_parameters["bagging_fraction"] is not None + ): + lg_parameters.update({"subsample": None, "subsample_freq": None}) - lgbm_error_msg = "LightGBM is not installed. Please install using `pip install lightgbm`." + lgbm_error_msg = ( + "LightGBM is not installed. Please install using `pip install lightgbm`." 
+ ) lgbm = import_or_raise("lightgbm", error_msg=lgbm_error_msg) self._ordinal_encoder = None - lgbm_regressor = lgbm.sklearn.LGBMRegressor(random_state=random_seed, **lg_parameters) + lgbm_regressor = lgbm.sklearn.LGBMRegressor( + random_state=random_seed, **lg_parameters + ) - super().__init__(parameters=parameters, - component_obj=lgbm_regressor, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=lgbm_regressor, random_seed=random_seed + ) def _encode_categories(self, X, fit=False): """Encodes each categorical feature using ordinal encoding.""" X = infer_feature_types(X) - cat_cols = list(X.ww.select('category').columns) + cat_cols = list(X.ww.select("category").columns) if fit: self.input_feature_names = list(X.columns) X_encoded = _rename_column_names_to_numeric(X) @@ -87,7 +108,7 @@ def _encode_categories(self, X, fit=False): else: encoder_output = self._ordinal_encoder.transform(X_encoded[cat_cols]) X_encoded[cat_cols] = pd.DataFrame(encoder_output) - X_encoded[cat_cols] = X_encoded[cat_cols].astype('category') + X_encoded[cat_cols] = X_encoded[cat_cols].astype("category") return X_encoded def fit(self, X, y=None): diff --git a/evalml/pipelines/components/estimators/regressors/linear_regressor.py b/evalml/pipelines/components/estimators/regressors/linear_regressor.py index 8747e8dbd8..cd7a53b634 100644 --- a/evalml/pipelines/components/estimators/regressors/linear_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/linear_regressor.py @@ -7,25 +7,30 @@ class LinearRegressor(Estimator): """Linear Regressor.""" + name = "Linear Regressor" - hyperparameter_ranges = { - 'fit_intercept': [True, False], - 'normalize': [True, False] - } + hyperparameter_ranges = {"fit_intercept": [True, False], "normalize": [True, False]} model_family = ModelFamily.LINEAR_MODEL - supported_problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION] + supported_problem_types = [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ] - def __init__(self, fit_intercept=True, normalize=False, n_jobs=-1, random_seed=0, **kwargs): + def __init__( + self, fit_intercept=True, normalize=False, n_jobs=-1, random_seed=0, **kwargs + ): parameters = { - 'fit_intercept': fit_intercept, - 'normalize': normalize, - 'n_jobs': n_jobs + "fit_intercept": fit_intercept, + "normalize": normalize, + "n_jobs": n_jobs, } parameters.update(kwargs) linear_regressor = SKLinearRegression(**parameters) - super().__init__(parameters=parameters, - component_obj=linear_regressor, - random_seed=random_seed) + super().__init__( + parameters=parameters, + component_obj=linear_regressor, + random_seed=random_seed, + ) @property def feature_importance(self): diff --git a/evalml/pipelines/components/estimators/regressors/rf_regressor.py b/evalml/pipelines/components/estimators/regressors/rf_regressor.py index ed2395be92..53f210c43f 100644 --- a/evalml/pipelines/components/estimators/regressors/rf_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/rf_regressor.py @@ -8,22 +8,29 @@ class RandomForestRegressor(Estimator): """Random Forest Regressor.""" + name = "Random Forest Regressor" hyperparameter_ranges = { "n_estimators": Integer(10, 1000), "max_depth": Integer(1, 32), } model_family = ModelFamily.RANDOM_FOREST - supported_problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION] + supported_problem_types = [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ] - def __init__(self, n_estimators=100, 
max_depth=6, n_jobs=-1, random_seed=0, **kwargs): - parameters = {"n_estimators": n_estimators, - "max_depth": max_depth, - "n_jobs": n_jobs} + def __init__( + self, n_estimators=100, max_depth=6, n_jobs=-1, random_seed=0, **kwargs + ): + parameters = { + "n_estimators": n_estimators, + "max_depth": max_depth, + "n_jobs": n_jobs, + } parameters.update(kwargs) - rf_regressor = SKRandomForestRegressor(random_state=random_seed, - **parameters) - super().__init__(parameters=parameters, - component_obj=rf_regressor, - random_seed=random_seed) + rf_regressor = SKRandomForestRegressor(random_state=random_seed, **parameters) + super().__init__( + parameters=parameters, component_obj=rf_regressor, random_seed=random_seed + ) diff --git a/evalml/pipelines/components/estimators/regressors/svm_regressor.py b/evalml/pipelines/components/estimators/regressors/svm_regressor.py index d0059aac0c..6eb18a9b01 100644 --- a/evalml/pipelines/components/estimators/regressors/svm_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/svm_regressor.py @@ -9,38 +9,35 @@ class SVMRegressor(Estimator): """Support Vector Machine Regressor.""" + name = "SVM Regressor" hyperparameter_ranges = { "C": Real(0, 10), "kernel": ["linear", "poly", "rbf", "sigmoid", "precomputed"], - "gamma": ["scale", "auto"] + "gamma": ["scale", "auto"], } model_family = ModelFamily.SVM - supported_problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION] + supported_problem_types = [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ] - def __init__(self, - C=1.0, - kernel="rbf", - gamma="scale", - random_seed=0, - **kwargs): - parameters = {"C": C, - "kernel": kernel, - "gamma": gamma} + def __init__(self, C=1.0, kernel="rbf", gamma="scale", random_seed=0, **kwargs): + parameters = {"C": C, "kernel": kernel, "gamma": gamma} parameters.update(kwargs) # SVR doesn't take a random_state arg svm_regressor = SVR(**parameters) - super().__init__(parameters=parameters, - component_obj=svm_regressor, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=svm_regressor, random_seed=random_seed + ) @property def feature_importance(self): """Feature importance only works with linear kernels. 
If the kernel isn't linear, we return a numpy array of zeros """ - if self._parameters['kernel'] != 'linear': + if self._parameters["kernel"] != "linear": return np.zeros(self._component_obj.n_features_in_) else: return self._component_obj.coef_ diff --git a/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py b/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py index 6af541904c..f5fc23267c 100644 --- a/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py +++ b/evalml/pipelines/components/estimators/regressors/time_series_baseline_estimator.py @@ -12,11 +12,15 @@ class TimeSeriesBaselineEstimator(Estimator): This is useful as a simple baseline estimator for time series problems """ + name = "Time Series Baseline Estimator" hyperparameter_ranges = {} model_family = ModelFamily.BASELINE - supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.TIME_SERIES_BINARY, - ProblemTypes.TIME_SERIES_MULTICLASS] + supported_problem_types = [ + ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ] predict_uses_y = True def __init__(self, gap=1, random_seed=0, **kwargs): @@ -34,13 +38,15 @@ def __init__(self, gap=1, random_seed=0, **kwargs): self.gap = gap if gap < 0: - raise ValueError(f'gap value must be a positive integer. {gap} was provided.') + raise ValueError( + f"gap value must be a positive integer. {gap} was provided." + ) parameters = {"gap": gap} parameters.update(kwargs) - super().__init__(parameters=parameters, - component_obj=None, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=None, random_seed=random_seed + ) def fit(self, X, y=None): if X is None: @@ -51,7 +57,9 @@ def fit(self, X, y=None): def predict(self, X, y=None): if y is None: - raise ValueError("Cannot predict Time Series Baseline Estimator if y is None") + raise ValueError( + "Cannot predict Time Series Baseline Estimator if y is None" + ) y = infer_feature_types(y) if self.gap == 0: @@ -61,9 +69,11 @@ def predict(self, X, y=None): def predict_proba(self, X, y=None): if y is None: - raise ValueError("Cannot predict Time Series Baseline Estimator if y is None") + raise ValueError( + "Cannot predict Time Series Baseline Estimator if y is None" + ) y = infer_feature_types(y) - preds = self.predict(X, y).dropna(axis=0, how='any').astype('int') + preds = self.predict(X, y).dropna(axis=0, how="any").astype("int") proba_arr = np.zeros((len(preds), y.max() + 1)) proba_arr[np.arange(len(preds)), preds] = 1 padded = pad_with_nans(pd.DataFrame(proba_arr), len(y) - len(preds)) diff --git a/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py b/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py index cf380f01aa..bea64090d6 100644 --- a/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py @@ -3,14 +3,12 @@ from evalml.model_family import ModelFamily from evalml.pipelines.components.estimators import Estimator from evalml.problem_types import ProblemTypes -from evalml.utils.gen_utils import ( - _rename_column_names_to_numeric, - import_or_raise -) +from evalml.utils.gen_utils import _rename_column_names_to_numeric, import_or_raise class XGBoostRegressor(Estimator): """XGBoost Regressor.""" + name = "XGBoost Regressor" hyperparameter_ranges = { "eta": Real(0.000001, 1), @@ -19,27 +17,41 @@ class 
XGBoostRegressor(Estimator): "n_estimators": Integer(1, 1000), } model_family = ModelFamily.XGBOOST - supported_problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION] + supported_problem_types = [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ] # xgboost supports seeds from -2**31 to 2**31 - 1 inclusive. these limits ensure the random seed generated below # is within that range. - SEED_MIN = -2**31 - SEED_MAX = 2**31 - 1 - - def __init__(self, eta=0.1, max_depth=6, min_child_weight=1, n_estimators=100, random_seed=0, **kwargs): - parameters = {"eta": eta, - "max_depth": max_depth, - "min_child_weight": min_child_weight, - "n_estimators": n_estimators} + SEED_MIN = -(2 ** 31) + SEED_MAX = 2 ** 31 - 1 + + def __init__( + self, + eta=0.1, + max_depth=6, + min_child_weight=1, + n_estimators=100, + random_seed=0, + **kwargs + ): + parameters = { + "eta": eta, + "max_depth": max_depth, + "min_child_weight": min_child_weight, + "n_estimators": n_estimators, + } parameters.update(kwargs) - xgb_error_msg = "XGBoost is not installed. Please install using `pip install xgboost.`" + xgb_error_msg = ( + "XGBoost is not installed. Please install using `pip install xgboost.`" + ) xgb = import_or_raise("xgboost", error_msg=xgb_error_msg) - xgb_Regressor = xgb.XGBRegressor(random_state=random_seed, - **parameters) - super().__init__(parameters=parameters, - component_obj=xgb_Regressor, - random_seed=random_seed) + xgb_Regressor = xgb.XGBRegressor(random_state=random_seed, **parameters) + super().__init__( + parameters=parameters, component_obj=xgb_Regressor, random_seed=random_seed + ) def fit(self, X, y=None): X, y = super()._manage_woodwork(X, y) diff --git a/evalml/pipelines/components/transformers/__init__.py b/evalml/pipelines/components/transformers/__init__.py index 0cdefb280f..474f24a973 100644 --- a/evalml/pipelines/components/transformers/__init__.py +++ b/evalml/pipelines/components/transformers/__init__.py @@ -1,9 +1,21 @@ from .transformer import Transformer from .encoders import OneHotEncoder, TargetEncoder -from .feature_selection import FeatureSelector, RFClassifierSelectFromModel, RFRegressorSelectFromModel +from .feature_selection import ( + FeatureSelector, + RFClassifierSelectFromModel, + RFRegressorSelectFromModel, +) from .imputers import PerColumnImputer, SimpleImputer, Imputer, TargetImputer from .scalers import StandardScaler from .samplers import Undersampler, SMOTESampler, SMOTENCSampler, SMOTENSampler from .column_selectors import DropColumns, SelectColumns from .dimensionality_reduction import LinearDiscriminantAnalysis, PCA -from .preprocessing import DateTimeFeaturizer, DropNullColumns, LSA, TextFeaturizer, DelayedFeatureTransformer, DFSTransformer, PolynomialDetrender +from .preprocessing import ( + DateTimeFeaturizer, + DropNullColumns, + LSA, + TextFeaturizer, + DelayedFeatureTransformer, + DFSTransformer, + PolynomialDetrender, +) diff --git a/evalml/pipelines/components/transformers/column_selectors.py b/evalml/pipelines/components/transformers/column_selectors.py index 39a4eb5ac8..344fa20b1b 100644 --- a/evalml/pipelines/components/transformers/column_selectors.py +++ b/evalml/pipelines/components/transformers/column_selectors.py @@ -5,7 +5,6 @@ class ColumnSelector(Transformer): - def __init__(self, columns=None, random_seed=0, **kwargs): """Initalizes an transformer that drops specified columns in input data. 
@@ -13,13 +12,15 @@ def __init__(self, columns=None, random_seed=0, **kwargs): columns (list(string)): List of column names, used to determine which columns to drop. """ if columns and not isinstance(columns, list): - raise ValueError(f"Parameter columns must be a list. Received {type(columns)}.") + raise ValueError( + f"Parameter columns must be a list. Received {type(columns)}." + ) parameters = {"columns": columns} parameters.update(kwargs) - super().__init__(parameters=parameters, - component_obj=None, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=None, random_seed=random_seed + ) def _check_input_for_columns(self, X): cols = self.parameters.get("columns") or [] @@ -29,7 +30,9 @@ def _check_input_for_columns(self, X): missing_cols = set(cols) - set(column_names) if missing_cols: raise ValueError( - "Columns {} not found in input data".format(', '.join(f"'{col_name}'" for col_name in missing_cols)) + "Columns {} not found in input data".format( + ", ".join(f"'{col_name}'" for col_name in missing_cols) + ) ) @abstractmethod @@ -60,6 +63,7 @@ def transform(self, X, y=None): class DropColumns(ColumnSelector): """Drops specified columns in input data.""" + name = "Drop Columns Transformer" hyperparameter_ranges = {} needs_fitting = False @@ -82,6 +86,7 @@ def transform(self, X, y=None): class SelectColumns(ColumnSelector): """Selects specified columns in input data.""" + name = "Select Columns Transformer" hyperparameter_ranges = {} needs_fitting = False diff --git a/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py b/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py index dceb448289..10d5c4ead5 100644 --- a/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py +++ b/evalml/pipelines/components/transformers/dimensionality_reduction/lda.py @@ -5,13 +5,14 @@ from evalml.utils import ( _retain_custom_types_and_initalize_woodwork, infer_feature_types, - is_all_numeric + is_all_numeric, ) class LinearDiscriminantAnalysis(Transformer): """Reduces the number of features by using Linear Discriminant Analysis""" - name = 'Linear Discriminant Analysis Transformer' + + name = "Linear Discriminant Analysis Transformer" hyperparameter_ranges = {} def __init__(self, n_components=None, random_seed=0, **kwargs): @@ -22,13 +23,15 @@ def __init__(self, n_components=None, random_seed=0, **kwargs): random_seed (int): Seed for the random number generator. Defaults to 0. 
""" if n_components and n_components < 1: - raise ValueError("Invalid number of compponents for Linear Discriminant Analysis") + raise ValueError( + "Invalid number of compponents for Linear Discriminant Analysis" + ) parameters = {"n_components": n_components} parameters.update(kwargs) lda = SkLDA(n_components=n_components, **kwargs) - super().__init__(parameters=parameters, - component_obj=lda, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=lda, random_seed=random_seed + ) def fit(self, X, y): X = infer_feature_types(X) @@ -37,7 +40,7 @@ def fit(self, X, y): y = infer_feature_types(y) n_features = X.shape[1] n_classes = y.nunique() - n_components = self.parameters['n_components'] + n_components = self.parameters["n_components"] if n_components is not None and n_components > min(n_classes, n_features): raise ValueError(f"n_components value {n_components} is too large") @@ -49,7 +52,11 @@ def transform(self, X, y=None): if not is_all_numeric(X_ww): raise ValueError("LDA input must be all numeric") X_t = self._component_obj.transform(X) - X_t = pd.DataFrame(X_t, index=X_ww.index, columns=[f"component_{i}" for i in range(X_t.shape[1])]) + X_t = pd.DataFrame( + X_t, + index=X_ww.index, + columns=[f"component_{i}" for i in range(X_t.shape[1])], + ) return _retain_custom_types_and_initalize_woodwork(X_ww, X_t) def fit_transform(self, X, y=None): @@ -59,5 +66,9 @@ def fit_transform(self, X, y=None): y = infer_feature_types(y) X_t = self._component_obj.fit_transform(X, y) - X_t = pd.DataFrame(X_t, index=X_ww.index, columns=[f"component_{i}" for i in range(X_t.shape[1])]) + X_t = pd.DataFrame( + X_t, + index=X_ww.index, + columns=[f"component_{i}" for i in range(X_t.shape[1])], + ) return _retain_custom_types_and_initalize_woodwork(X_ww, X_t) diff --git a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py index 38e8f04503..db29912a99 100644 --- a/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py +++ b/evalml/pipelines/components/transformers/dimensionality_reduction/pca.py @@ -6,15 +6,15 @@ from evalml.utils import ( _retain_custom_types_and_initalize_woodwork, infer_feature_types, - is_all_numeric + is_all_numeric, ) class PCA(Transformer): """Reduces the number of features by using Principal Component Analysis""" - name = 'PCA Transformer' - hyperparameter_ranges = { - "variance": Real(0.25, 1)} + + name = "PCA Transformer" + hyperparameter_ranges = {"variance": Real(0.25, 1)} def __init__(self, variance=0.95, n_components=None, random_seed=0, **kwargs): """Initalizes an transformer that reduces the number of features using PCA." @@ -26,16 +26,15 @@ def __init__(self, variance=0.95, n_components=None, random_seed=0, **kwargs): variance variable if set. random_seed (int): Seed for the random number generator. Defaults to 0. 
""" - parameters = {"variance": variance, - "n_components": n_components} + parameters = {"variance": variance, "n_components": n_components} parameters.update(kwargs) if n_components: pca = SkPCA(n_components=n_components, random_state=random_seed, **kwargs) else: pca = SkPCA(n_components=variance, random_state=random_seed, **kwargs) - super().__init__(parameters=parameters, - component_obj=pca, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=pca, random_seed=random_seed + ) def fit(self, X, y=None): X = infer_feature_types(X) @@ -49,7 +48,11 @@ def transform(self, X, y=None): if not is_all_numeric(X_ww): raise ValueError("PCA input must be all numeric") X_t = self._component_obj.transform(X) - X_t = pd.DataFrame(X_t, index=X_ww.index, columns=[f"component_{i}" for i in range(X_t.shape[1])]) + X_t = pd.DataFrame( + X_t, + index=X_ww.index, + columns=[f"component_{i}" for i in range(X_t.shape[1])], + ) return _retain_custom_types_and_initalize_woodwork(X_ww, X_t) def fit_transform(self, X, y=None): @@ -57,5 +60,9 @@ def fit_transform(self, X, y=None): if not is_all_numeric(X_ww): raise ValueError("PCA input must be all numeric") X_t = self._component_obj.fit_transform(X, y) - X_t = pd.DataFrame(X_t, index=X_ww.index, columns=[f"component_{i}" for i in range(X_t.shape[1])]) + X_t = pd.DataFrame( + X_t, + index=X_ww.index, + columns=[f"component_{i}" for i in range(X_t.shape[1])], + ) return _retain_custom_types_and_initalize_woodwork(X_ww, X_t) diff --git a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py index 955d7a036e..6b1cc28c88 100644 --- a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py @@ -6,29 +6,36 @@ from evalml.pipelines.components.transformers.transformer import Transformer from evalml.utils import ( _retain_custom_types_and_initalize_woodwork, - infer_feature_types + infer_feature_types, ) class OneHotEncoderMeta(ComponentBaseMeta): """A version of the ComponentBaseMeta class which includes validation on an additional one-hot-encoder-specific method `categories`""" - METHODS_TO_CHECK = ComponentBaseMeta.METHODS_TO_CHECK + ['categories', 'get_feature_names'] + + METHODS_TO_CHECK = ComponentBaseMeta.METHODS_TO_CHECK + [ + "categories", + "get_feature_names", + ] class OneHotEncoder(Transformer, metaclass=OneHotEncoderMeta): """One-hot encoder to encode non-numeric data.""" - name = 'One Hot Encoder' + + name = "One Hot Encoder" hyperparameter_ranges = {} - def __init__(self, - top_n=10, - features_to_encode=None, - categories=None, - drop='if_binary', - handle_unknown="ignore", - handle_missing="error", - random_seed=0, - **kwargs): + def __init__( + self, + top_n=10, + features_to_encode=None, + categories=None, + drop="if_binary", + handle_unknown="ignore", + handle_missing="error", + random_seed=0, + **kwargs, + ): """Initalizes an transformer that encodes categorical features in a one-hot numeric array." Arguments: @@ -49,79 +56,103 @@ def __init__(self, values encountered will raise an error. Defaults to "error". random_seed (int): Seed for the random number generator. Defaults to 0. 
""" - parameters = {"top_n": top_n, - "features_to_encode": features_to_encode, - "categories": categories, - "drop": drop, - "handle_unknown": handle_unknown, - "handle_missing": handle_missing} + parameters = { + "top_n": top_n, + "features_to_encode": features_to_encode, + "categories": categories, + "drop": drop, + "handle_unknown": handle_unknown, + "handle_missing": handle_missing, + } parameters.update(kwargs) # Check correct inputs unknown_input_options = ["ignore", "error"] missing_input_options = ["as_category", "error"] if handle_unknown not in unknown_input_options: - raise ValueError("Invalid input {} for handle_unknown".format(handle_unknown)) + raise ValueError( + "Invalid input {} for handle_unknown".format(handle_unknown) + ) if handle_missing not in missing_input_options: - raise ValueError("Invalid input {} for handle_missing".format(handle_missing)) + raise ValueError( + "Invalid input {} for handle_missing".format(handle_missing) + ) if top_n is not None and categories is not None: raise ValueError("Cannot use categories and top_n arguments simultaneously") self.features_to_encode = features_to_encode self._encoder = None - super().__init__(parameters=parameters, - component_obj=None, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=None, random_seed=random_seed + ) self._initial_state = self.random_seed self._provenance = {} @staticmethod def _get_cat_cols(X): """Get names of categorical columns in the input DataFrame.""" - return list(X.ww.select(include=['category']).columns) + return list(X.ww.select(include=["category"]).columns) def fit(self, X, y=None): - top_n = self.parameters['top_n'] + top_n = self.parameters["top_n"] X = infer_feature_types(X) if self.features_to_encode is None: self.features_to_encode = self._get_cat_cols(X) X_t = X - invalid_features = [col for col in self.features_to_encode if col not in list(X.columns)] + invalid_features = [ + col for col in self.features_to_encode if col not in list(X.columns) + ] if len(invalid_features) > 0: - raise ValueError("Could not find and encode {} in input data.".format(', '.join(invalid_features))) + raise ValueError( + "Could not find and encode {} in input data.".format( + ", ".join(invalid_features) + ) + ) X_t = self._handle_parameter_handle_missing(X_t) self._binary_values_to_drop = [] if len(self.features_to_encode) == 0: - categories = 'auto' - elif self.parameters['categories'] is not None: - categories = self.parameters['categories'] - if len(categories) != len(self.features_to_encode) or not isinstance(categories[0], list): - raise ValueError('Categories argument must contain a list of categories for each categorical feature') + categories = "auto" + elif self.parameters["categories"] is not None: + categories = self.parameters["categories"] + if len(categories) != len(self.features_to_encode) or not isinstance( + categories[0], list + ): + raise ValueError( + "Categories argument must contain a list of categories for each categorical feature" + ) else: categories = [] for col in X_t[self.features_to_encode]: value_counts = X_t[col].value_counts(dropna=False).to_frame() - if self.parameters['drop'] == "if_binary" and len(value_counts) == 2: + if self.parameters["drop"] == "if_binary" and len(value_counts) == 2: majority_class_value = value_counts.index.tolist()[0] self._binary_values_to_drop.append((col, majority_class_value)) if top_n is None or len(value_counts) <= top_n: unique_values = value_counts.index.tolist() else: - value_counts = 
value_counts.sample(frac=1, random_state=self._initial_state) - value_counts = value_counts.sort_values([col], ascending=False, kind='mergesort') + value_counts = value_counts.sample( + frac=1, random_state=self._initial_state + ) + value_counts = value_counts.sort_values( + [col], ascending=False, kind="mergesort" + ) unique_values = value_counts.head(top_n).index.tolist() unique_values = np.sort(unique_values) categories.append(unique_values) # Create an encoder to pass off the rest of the computation to # if "drop" is set to "if_binary", pass None to scikit-learn because we manually handle - drop_to_use = None if self.parameters['drop'] == "if_binary" else self.parameters['drop'] - self._encoder = SKOneHotEncoder(categories=categories, - drop=drop_to_use, - handle_unknown=self.parameters['handle_unknown']) + drop_to_use = ( + None if self.parameters["drop"] == "if_binary" else self.parameters["drop"] + ) + self._encoder = SKOneHotEncoder( + categories=categories, + drop=drop_to_use, + handle_unknown=self.parameters["handle_unknown"], + ) self._encoder.fit(X_t[self.features_to_encode]) return self @@ -144,7 +175,10 @@ def transform(self, X, y=None): # Call sklearn's transform on the categorical columns if len(self.features_to_encode) > 0: - X_cat = pd.DataFrame(self._encoder.transform(X_copy[self.features_to_encode]).toarray(), index=X_copy.index) + X_cat = pd.DataFrame( + self._encoder.transform(X_copy[self.features_to_encode]).toarray(), + index=X_copy.index, + ) X_cat.columns = self._get_feature_names() X_cat.drop(columns=self._features_to_drop, inplace=True) X_cat.ww.init(logical_types={c: "Boolean" for c in X_cat.columns}) @@ -157,13 +191,13 @@ def transform(self, X, y=None): def _handle_parameter_handle_missing(self, X): """Helper method to handle the `handle_missing` parameter.""" cat_cols = self.features_to_encode - if self.parameters['handle_missing'] == "error" and X.isnull().any().any(): + if self.parameters["handle_missing"] == "error" and X.isnull().any().any(): raise ValueError("Input contains NaN") - if self.parameters['handle_missing'] == "as_category": + if self.parameters["handle_missing"] == "as_category": for col in cat_cols: - if X[col].dtype == 'category' and pd.isna(X[col]).any(): + if X[col].dtype == "category" and pd.isna(X[col]).any(): X[col] = X[col].cat.add_categories("nan") - X[col] = X[col].where(~pd.isna(X[col]), other='nan') + X[col] = X[col].where(~pd.isna(X[col]), other="nan") X[col] = X[col].replace(np.nan, "nan") return X @@ -178,7 +212,9 @@ def categories(self, feature_name): try: index = self.features_to_encode.index(feature_name) except Exception: - raise ValueError(f'Feature "{feature_name}" was not provided to one-hot encoder as a training feature') + raise ValueError( + f'Feature "{feature_name}" was not provided to one-hot encoder as a training feature' + ) return self._encoder.categories_[index] @staticmethod @@ -219,7 +255,10 @@ def _get_feature_names(self): for cat_index, category in enumerate(column_categories): # Drop categories specified by the user - if self._encoder.drop_idx_ is not None and self._encoder.drop_idx_[col_index] is not None: + if ( + self._encoder.drop_idx_ is not None + and self._encoder.drop_idx_[col_index] is not None + ): if cat_index == self._encoder.drop_idx_[col_index]: continue diff --git a/evalml/pipelines/components/transformers/encoders/target_encoder.py b/evalml/pipelines/components/transformers/encoders/target_encoder.py index 6c04aaae40..bb9a008330 100644 --- 
a/evalml/pipelines/components/transformers/encoders/target_encoder.py +++ b/evalml/pipelines/components/transformers/encoders/target_encoder.py @@ -4,27 +4,30 @@ from ..transformer import Transformer from evalml.pipelines.components.transformers.encoders.onehot_encoder import ( - OneHotEncoderMeta + OneHotEncoderMeta, ) from evalml.utils import ( _retain_custom_types_and_initalize_woodwork, import_or_raise, - infer_feature_types + infer_feature_types, ) class TargetEncoder(Transformer, metaclass=OneHotEncoderMeta): """Target encoder to encode categorical data""" - name = 'Target Encoder' + + name = "Target Encoder" hyperparameter_ranges = {} - def __init__(self, - cols=None, - smoothing=1.0, - handle_unknown='value', - handle_missing='value', - random_seed=0, - **kwargs): + def __init__( + self, + cols=None, + smoothing=1.0, + handle_unknown="value", + handle_missing="value", + random_seed=0, + **kwargs + ): """Initializes a transformer that encodes categorical features into target encodings. Arguments: @@ -37,26 +40,41 @@ def __init__(self, handle_missing (string): Determines how to handle missing values encountered during `fit` or `transform`. Options are 'value', 'error', and 'return_nan'. Defaults to 'value', which replaces with the target mean random_seed (int): Seed for the random number generator. Defaults to 0. - """ + """ - parameters = {"cols": cols, - "smoothing": smoothing, - "handle_unknown": handle_unknown, - "handle_missing": handle_missing} + parameters = { + "cols": cols, + "smoothing": smoothing, + "handle_unknown": handle_unknown, + "handle_missing": handle_missing, + } parameters.update(kwargs) - unknown_and_missing_input_options = ['error', 'return_nan', 'value'] + unknown_and_missing_input_options = ["error", "return_nan", "value"] if handle_unknown not in unknown_and_missing_input_options: - raise ValueError("Invalid input '{}' for handle_unknown".format(handle_unknown)) + raise ValueError( + "Invalid input '{}' for handle_unknown".format(handle_unknown) + ) if handle_missing not in unknown_and_missing_input_options: - raise ValueError("Invalid input '{}' for handle_missing".format(handle_missing)) + raise ValueError( + "Invalid input '{}' for handle_missing".format(handle_missing) + ) if smoothing <= 0: - raise ValueError("Smoothing value needs to be strictly larger than 0. {} provided".format(smoothing)) + raise ValueError( + "Smoothing value needs to be strictly larger than 0. {} provided".format( + smoothing + ) + ) - category_encode = import_or_raise('category_encoders', error_msg='category_encoders not installed. Please install using `pip install category_encoders`') - super().__init__(parameters=parameters, - component_obj=category_encode.target_encoder.TargetEncoder(**parameters), - random_seed=random_seed) + category_encode = import_or_raise( + "category_encoders", + error_msg="category_encoders not installed. 
Please install using `pip install category_encoders`", + ) + super().__init__( + parameters=parameters, + component_obj=category_encode.target_encoder.TargetEncoder(**parameters), + random_seed=random_seed, + ) def fit(self, X, y): return super().fit(X, y) @@ -67,7 +85,9 @@ def transform(self, X, y=None): y = infer_feature_types(y) X_t = self._component_obj.transform(X, y) X_t_df = pd.DataFrame(X_t, columns=X_ww.columns, index=X_ww.index) - return _retain_custom_types_and_initalize_woodwork(X_ww.ww.logical_types, X_t_df, ltypes_to_ignore=[Categorical]) + return _retain_custom_types_and_initalize_woodwork( + X_ww.ww.logical_types, X_t_df, ltypes_to_ignore=[Categorical] + ) def fit_transform(self, X, y): return self.fit(X, y).transform(X, y) diff --git a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py index dc1367efb8..24a71cca78 100644 --- a/evalml/pipelines/components/transformers/feature_selection/feature_selector.py +++ b/evalml/pipelines/components/transformers/feature_selection/feature_selector.py @@ -4,7 +4,7 @@ from evalml.pipelines.components.transformers import Transformer from evalml.utils import ( _retain_custom_types_and_initalize_woodwork, - infer_feature_types + infer_feature_types, ) @@ -18,7 +18,13 @@ def get_names(self): list[str]: List of the names of features selected """ selected_masks = self._component_obj.get_support() - return [feature_name for (selected, feature_name) in zip(selected_masks, self.input_feature_names) if selected] + return [ + feature_name + for (selected, feature_name) in zip( + selected_masks, self.input_feature_names + ) + if selected + ] def transform(self, X, y=None): """Transforms input data by selecting features. If the component_obj does not have a transform method, will raise an MethodPropertyNotFoundError exception. 
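For context on the `get_names` hunk above: the selection logic just pairs the boolean support mask from the underlying scikit-learn selector with the stored input feature names and keeps the selected ones. A minimal standalone sketch of that idea (the mask and feature names here are invented for illustration):

    selected_masks = [True, False, True, False]
    input_feature_names = ["age", "zip_code", "income", "row_id"]
    # Keep only the names whose corresponding mask entry is True.
    selected = [
        name
        for selected_flag, name in zip(selected_masks, input_feature_names)
        if selected_flag
    ]
    print(selected)  # ['age', 'income']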
@@ -36,13 +42,19 @@ def transform(self, X, y=None): try: X_t = self._component_obj.transform(X) except AttributeError: - raise MethodPropertyNotFoundError("Feature selector requires a transform method or a component_obj that implements transform") + raise MethodPropertyNotFoundError( + "Feature selector requires a transform method or a component_obj that implements transform" + ) X_dtypes = X_ww.dtypes.to_dict() selected_col_names = self.get_names() col_types = {key: X_dtypes[key] for key in selected_col_names} - features = pd.DataFrame(X_t, columns=selected_col_names, index=X_ww.index).astype(col_types) - return _retain_custom_types_and_initalize_woodwork(X_ww.ww.logical_types, features) + features = pd.DataFrame( + X_t, columns=selected_col_names, index=X_ww.index + ).astype(col_types) + return _retain_custom_types_and_initalize_woodwork( + X_ww.ww.logical_types, features + ) def fit_transform(self, X, y=None): return self.fit(X, y).transform(X, y) diff --git a/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py index 1a33fbcec3..50533d7739 100644 --- a/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py +++ b/evalml/pipelines/components/transformers/feature_selection/rf_classifier_feature_selector.py @@ -8,31 +8,51 @@ class RFClassifierSelectFromModel(FeatureSelector): """Selects top features based on importance weights using a Random Forest classifier.""" - name = 'RF Classifier Select From Model' + + name = "RF Classifier Select From Model" hyperparameter_ranges = { - "percent_features": Real(.01, 1), - "threshold": ['mean', -np.inf] + "percent_features": Real(0.01, 1), + "threshold": ["mean", -np.inf], } - def __init__(self, number_features=None, n_estimators=10, max_depth=None, - percent_features=0.5, threshold=-np.inf, n_jobs=-1, random_seed=0, **kwargs): - parameters = {"number_features": number_features, - "n_estimators": n_estimators, - "max_depth": max_depth, - "percent_features": percent_features, - "threshold": threshold, - "n_jobs": n_jobs} + def __init__( + self, + number_features=None, + n_estimators=10, + max_depth=None, + percent_features=0.5, + threshold=-np.inf, + n_jobs=-1, + random_seed=0, + **kwargs + ): + parameters = { + "number_features": number_features, + "n_estimators": n_estimators, + "max_depth": max_depth, + "percent_features": percent_features, + "threshold": threshold, + "n_jobs": n_jobs, + } parameters.update(kwargs) - estimator = SKRandomForestClassifier(random_state=random_seed, - n_estimators=n_estimators, - max_depth=max_depth, - n_jobs=n_jobs) - max_features = max(1, int(percent_features * number_features)) if number_features else None - feature_selection = SkSelect(estimator=estimator, - max_features=max_features, - threshold=threshold, - **kwargs) - super().__init__(parameters=parameters, - component_obj=feature_selection, - random_seed=random_seed) + estimator = SKRandomForestClassifier( + random_state=random_seed, + n_estimators=n_estimators, + max_depth=max_depth, + n_jobs=n_jobs, + ) + max_features = ( + max(1, int(percent_features * number_features)) if number_features else None + ) + feature_selection = SkSelect( + estimator=estimator, + max_features=max_features, + threshold=threshold, + **kwargs + ) + super().__init__( + parameters=parameters, + component_obj=feature_selection, + random_seed=random_seed, + ) diff --git 
a/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py b/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py index 82d513781a..021206f466 100644 --- a/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py +++ b/evalml/pipelines/components/transformers/feature_selection/rf_regressor_feature_selector.py @@ -8,32 +8,52 @@ class RFRegressorSelectFromModel(FeatureSelector): """Selects top features based on importance weights using a Random Forest regressor.""" - name = 'RF Regressor Select From Model' + + name = "RF Regressor Select From Model" hyperparameter_ranges = { - "percent_features": Real(.01, 1), - "threshold": ['mean', -np.inf] + "percent_features": Real(0.01, 1), + "threshold": ["mean", -np.inf], } - def __init__(self, number_features=None, n_estimators=10, max_depth=None, - percent_features=0.5, threshold=-np.inf, n_jobs=-1, random_seed=0, **kwargs): + def __init__( + self, + number_features=None, + n_estimators=10, + max_depth=None, + percent_features=0.5, + threshold=-np.inf, + n_jobs=-1, + random_seed=0, + **kwargs + ): - parameters = {"number_features": number_features, - "n_estimators": n_estimators, - "max_depth": max_depth, - "percent_features": percent_features, - "threshold": threshold, - "n_jobs": n_jobs} + parameters = { + "number_features": number_features, + "n_estimators": n_estimators, + "max_depth": max_depth, + "percent_features": percent_features, + "threshold": threshold, + "n_jobs": n_jobs, + } parameters.update(kwargs) - estimator = SKRandomForestRegressor(random_state=random_seed, - n_estimators=n_estimators, - max_depth=max_depth, - n_jobs=n_jobs) - max_features = max(1, int(percent_features * number_features)) if number_features else None - feature_selection = SkSelect(estimator=estimator, - max_features=max_features, - threshold=threshold, - **kwargs) - super().__init__(parameters=parameters, - component_obj=feature_selection, - random_seed=random_seed) + estimator = SKRandomForestRegressor( + random_state=random_seed, + n_estimators=n_estimators, + max_depth=max_depth, + n_jobs=n_jobs, + ) + max_features = ( + max(1, int(percent_features * number_features)) if number_features else None + ) + feature_selection = SkSelect( + estimator=estimator, + max_features=max_features, + threshold=threshold, + **kwargs + ) + super().__init__( + parameters=parameters, + component_obj=feature_selection, + random_seed=random_seed, + ) diff --git a/evalml/pipelines/components/transformers/imputers/imputer.py b/evalml/pipelines/components/transformers/imputers/imputer.py index dba49021cb..3436a2123c 100644 --- a/evalml/pipelines/components/transformers/imputers/imputer.py +++ b/evalml/pipelines/components/transformers/imputers/imputer.py @@ -4,25 +4,32 @@ from evalml.pipelines.components.transformers.imputers import SimpleImputer from evalml.utils import ( _retain_custom_types_and_initalize_woodwork, - infer_feature_types + infer_feature_types, ) class Imputer(Transformer): """Imputes missing data according to a specified imputation strategy.""" + name = "Imputer" hyperparameter_ranges = { "categorical_impute_strategy": ["most_frequent"], - "numeric_impute_strategy": ["mean", "median", "most_frequent"] + "numeric_impute_strategy": ["mean", "median", "most_frequent"], } _valid_categorical_impute_strategies = set(["most_frequent", "constant"]) - _valid_numeric_impute_strategies = set(["mean", "median", "most_frequent", "constant"]) - - def __init__(self, 
categorical_impute_strategy="most_frequent", - categorical_fill_value=None, - numeric_impute_strategy="mean", - numeric_fill_value=None, - random_seed=0, **kwargs): + _valid_numeric_impute_strategies = set( + ["mean", "median", "most_frequent", "constant"] + ) + + def __init__( + self, + categorical_impute_strategy="most_frequent", + categorical_fill_value=None, + numeric_impute_strategy="mean", + numeric_fill_value=None, + random_seed=0, + **kwargs, + ): """Initalizes an transformer that imputes missing data according to the specified imputation strategy." Arguments: @@ -33,27 +40,37 @@ def __init__(self, categorical_impute_strategy="most_frequent", random_seed (int): Seed for the random number generator. Defaults to 0. """ if categorical_impute_strategy not in self._valid_categorical_impute_strategies: - raise ValueError(f"{categorical_impute_strategy} is an invalid parameter. Valid categorical impute strategies are {', '.join(self._valid_numeric_impute_strategies)}") + raise ValueError( + f"{categorical_impute_strategy} is an invalid parameter. Valid categorical impute strategies are {', '.join(self._valid_numeric_impute_strategies)}" + ) elif numeric_impute_strategy not in self._valid_numeric_impute_strategies: - raise ValueError(f"{numeric_impute_strategy} is an invalid parameter. Valid impute strategies are {', '.join(self._valid_numeric_impute_strategies)}") - - parameters = {"categorical_impute_strategy": categorical_impute_strategy, - "numeric_impute_strategy": numeric_impute_strategy, - "categorical_fill_value": categorical_fill_value, - "numeric_fill_value": numeric_fill_value} + raise ValueError( + f"{numeric_impute_strategy} is an invalid parameter. Valid impute strategies are {', '.join(self._valid_numeric_impute_strategies)}" + ) + + parameters = { + "categorical_impute_strategy": categorical_impute_strategy, + "numeric_impute_strategy": numeric_impute_strategy, + "categorical_fill_value": categorical_fill_value, + "numeric_fill_value": numeric_fill_value, + } parameters.update(kwargs) - self._categorical_imputer = SimpleImputer(impute_strategy=categorical_impute_strategy, - fill_value=categorical_fill_value, - **kwargs) - self._numeric_imputer = SimpleImputer(impute_strategy=numeric_impute_strategy, - fill_value=numeric_fill_value, - **kwargs) + self._categorical_imputer = SimpleImputer( + impute_strategy=categorical_impute_strategy, + fill_value=categorical_fill_value, + **kwargs, + ) + self._numeric_imputer = SimpleImputer( + impute_strategy=numeric_impute_strategy, + fill_value=numeric_fill_value, + **kwargs, + ) self._all_null_cols = None self._numeric_cols = None self._categorical_cols = None - super().__init__(parameters=parameters, - component_obj=None, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=None, random_seed=random_seed + ) def fit(self, X, y=None): """Fits imputer to data. 
'None' values are converted to np.nan before imputation and are @@ -67,10 +84,10 @@ def fit(self, X, y=None): self """ X = infer_feature_types(X) - cat_cols = list(X.ww.select(['category', 'boolean']).columns) - numeric_cols = list(X.ww.select(['numeric']).columns) + cat_cols = list(X.ww.select(["category", "boolean"]).columns) + numeric_cols = list(X.ww.select(["numeric"]).columns) - nan_ratio = X.ww.describe().loc['nan_count'] / X.shape[0] + nan_ratio = X.ww.describe().loc["nan_count"] / X.shape[0] self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist() X_numerics = X[[col for col in numeric_cols if col not in self._all_null_cols]] @@ -101,7 +118,7 @@ def transform(self, X, y=None): df = pd.DataFrame(index=X.index) return _retain_custom_types_and_initalize_woodwork(original_ltypes, df) - X.drop(self._all_null_cols, inplace=True, axis=1, errors='ignore') + X.drop(self._all_null_cols, inplace=True, axis=1, errors="ignore") if self._numeric_cols is not None and len(self._numeric_cols) > 0: X_numeric = X[self._numeric_cols.tolist()] diff --git a/evalml/pipelines/components/transformers/imputers/per_column_imputer.py b/evalml/pipelines/components/transformers/imputers/per_column_imputer.py index 5fd1b3bc0e..f3beadcec0 100644 --- a/evalml/pipelines/components/transformers/imputers/per_column_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/per_column_imputer.py @@ -1,19 +1,26 @@ from evalml.pipelines.components.transformers import Transformer from evalml.pipelines.components.transformers.imputers.simple_imputer import ( - SimpleImputer + SimpleImputer, ) from evalml.utils import ( _retain_custom_types_and_initalize_woodwork, - infer_feature_types + infer_feature_types, ) class PerColumnImputer(Transformer): """Imputes missing data according to a specified imputation strategy per column""" - name = 'Per Column Imputer' + + name = "Per Column Imputer" hyperparameter_ranges = {} - def __init__(self, impute_strategies=None, default_impute_strategy="most_frequent", random_seed=0, **kwargs): + def __init__( + self, + impute_strategies=None, + default_impute_strategy="most_frequent", + random_seed=0, + **kwargs + ): """Initializes a transformer that imputes missing data according to the specified imputation strategy per column." Arguments: @@ -30,18 +37,22 @@ def __init__(self, impute_strategies=None, default_impute_strategy="most_frequen random_seed (int): Seed for the random number generator. Defaults to 0. """ - parameters = {"impute_strategies": impute_strategies, - "default_impute_strategy": default_impute_strategy} + parameters = { + "impute_strategies": impute_strategies, + "default_impute_strategy": default_impute_strategy, + } self.imputers = None self.default_impute_strategy = default_impute_strategy self.impute_strategies = impute_strategies or dict() if not isinstance(self.impute_strategies, dict): - raise ValueError("`impute_strategies` is not a dictionary. Please provide in Column and {`impute_strategy`: strategy, `fill_value`:value} pairs. ") + raise ValueError( + "`impute_strategies` is not a dictionary. Please provide in Column and {`impute_strategy`: strategy, `fill_value`:value} pairs. 
" + ) - super().__init__(parameters=parameters, - component_obj=None, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=None, random_seed=random_seed + ) def fit(self, X, y=None): """Fits imputers on input data @@ -57,9 +68,13 @@ def fit(self, X, y=None): self.imputers = dict() for column in X.columns: strategy_dict = self.impute_strategies.get(column, dict()) - strategy = strategy_dict.get('impute_strategy', self.default_impute_strategy) - fill_value = strategy_dict.get('fill_value', None) - self.imputers[column] = SimpleImputer(impute_strategy=strategy, fill_value=fill_value) + strategy = strategy_dict.get( + "impute_strategy", self.default_impute_strategy + ) + fill_value = strategy_dict.get("fill_value", None) + self.imputers[column] = SimpleImputer( + impute_strategy=strategy, fill_value=fill_value + ) for column, imputer in self.imputers.items(): imputer.fit(X[[column]]) diff --git a/evalml/pipelines/components/transformers/imputers/simple_imputer.py b/evalml/pipelines/components/transformers/imputers/simple_imputer.py index 7145131d4e..d4deb4b89d 100644 --- a/evalml/pipelines/components/transformers/imputers/simple_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/simple_imputer.py @@ -5,16 +5,19 @@ from evalml.pipelines.components.transformers import Transformer from evalml.utils import ( _retain_custom_types_and_initalize_woodwork, - infer_feature_types + infer_feature_types, ) class SimpleImputer(Transformer): """Imputes missing data according to a specified imputation strategy.""" - name = 'Simple Imputer' + + name = "Simple Imputer" hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]} - def __init__(self, impute_strategy="most_frequent", fill_value=None, random_seed=0, **kwargs): + def __init__( + self, impute_strategy="most_frequent", fill_value=None, random_seed=0, **kwargs + ): """Initalizes an transformer that imputes missing data according to the specified imputation strategy." Arguments: @@ -24,16 +27,13 @@ def __init__(self, impute_strategy="most_frequent", fill_value=None, random_seed Defaults to 0 when imputing numerical data and "missing_value" for strings or object data types. random_seed (int): Seed for the random number generator. Defaults to 0. """ - parameters = {"impute_strategy": impute_strategy, - "fill_value": fill_value} + parameters = {"impute_strategy": impute_strategy, "fill_value": fill_value} parameters.update(kwargs) - imputer = SkImputer(strategy=impute_strategy, - fill_value=fill_value, - **kwargs) + imputer = SkImputer(strategy=impute_strategy, fill_value=fill_value, **kwargs) self._all_null_cols = None - super().__init__(parameters=parameters, - component_obj=imputer, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=imputer, random_seed=random_seed + ) def fit(self, X, y=None): """Fits imputer to data. 
'None' values are converted to np.nan before imputation and are @@ -47,18 +47,20 @@ def fit(self, X, y=None): self """ X = infer_feature_types(X) - nan_ratio = X.ww.describe().loc['nan_count'] / X.shape[0] + nan_ratio = X.ww.describe().loc["nan_count"] / X.shape[0] self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist() # Not using select because we just need column names, not a new dataframe - natural_language_columns = [col for col, ltype in X.ww.logical_types.items() if ltype == NaturalLanguage] + natural_language_columns = [ + col for col, ltype in X.ww.logical_types.items() if ltype == NaturalLanguage + ] if natural_language_columns: X = X.ww.copy() X.ww.set_types({col: "Categorical" for col in natural_language_columns}) # Convert all bool dtypes to category for fitting if (X.dtypes == bool).all(): - X = X.astype('category') + X = X.astype("category") self._component_obj.fit(X, y) return self @@ -84,7 +86,13 @@ def transform(self, X, y=None): original_index = X.index # Not using select because we just need column names, not a new dataframe - X.ww.set_types({col: "Categorical" for col, ltype in X.ww.logical_types.items() if ltype == NaturalLanguage}) + X.ww.set_types( + { + col: "Categorical" + for col, ltype in X.ww.logical_types.items() + if ltype == NaturalLanguage + } + ) X = self._component_obj.transform(X) X = pd.DataFrame(X, columns=not_all_null_cols) diff --git a/evalml/pipelines/components/transformers/imputers/target_imputer.py b/evalml/pipelines/components/transformers/imputers/target_imputer.py index ce916fe4ed..28b9a9bec6 100644 --- a/evalml/pipelines/components/transformers/imputers/target_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/target_imputer.py @@ -1,4 +1,3 @@ - from functools import wraps import pandas as pd @@ -9,7 +8,7 @@ from evalml.pipelines.components.transformers import Transformer from evalml.utils import ( _retain_custom_types_and_initalize_woodwork, - infer_feature_types + infer_feature_types, ) @@ -19,24 +18,31 @@ class TargetImputerMeta(ComponentBaseMeta): @classmethod def check_for_fit(cls, method): """`check_for_fit` wraps a method that validates if `self._is_fitted` is `True`. - It raises an exception if `False` and calls and returns the wrapped method if `True`. + It raises an exception if `False` and calls and returns the wrapped method if `True`. """ + @wraps(method) def _check_for_fit(self, X=None, y=None): klass = type(self).__name__ if not self._is_fitted and self.needs_fitting: - raise ComponentNotYetFittedError(f'This {klass} is not fitted yet. You must fit {klass} before calling {method.__name__}.') + raise ComponentNotYetFittedError( + f"This {klass} is not fitted yet. You must fit {klass} before calling {method.__name__}." + ) else: return method(self, X, y) + return _check_for_fit class TargetImputer(Transformer, metaclass=TargetImputerMeta): """Imputes missing target data according to a specified imputation strategy.""" - name = 'Target Imputer' + + name = "Target Imputer" hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]} - def __init__(self, impute_strategy="most_frequent", fill_value=None, random_seed=0, **kwargs): + def __init__( + self, impute_strategy="most_frequent", fill_value=None, random_seed=0, **kwargs + ): """Initalizes an transformer that imputes missing target data according to the specified imputation strategy." Arguments: impute_strategy (string): Impute strategy to use. 
Valid values include "mean", "median", "most_frequent", "constant" for @@ -45,15 +51,12 @@ def __init__(self, impute_strategy="most_frequent", fill_value=None, random_seed Defaults to 0 when imputing numerical data and "missing_value" for strings or object data types. random_seed (int): Seed for the random number generator. Defaults to 0. """ - parameters = {"impute_strategy": impute_strategy, - "fill_value": fill_value} + parameters = {"impute_strategy": impute_strategy, "fill_value": fill_value} parameters.update(kwargs) - imputer = SkImputer(strategy=impute_strategy, - fill_value=fill_value, - **kwargs) - super().__init__(parameters=parameters, - component_obj=imputer, - random_seed=random_seed) + imputer = SkImputer(strategy=impute_strategy, fill_value=fill_value, **kwargs) + super().__init__( + parameters=parameters, component_obj=imputer, random_seed=random_seed + ) def fit(self, X, y): """Fits imputer to target data. 'None' values are converted to np.nan before imputation and are @@ -72,7 +75,7 @@ def fit(self, X, y): # Convert all bool dtypes to category for fitting if (y.dtypes == bool).all(): - y = y.astype('category') + y = y.astype("category") self._component_obj.fit(y) return self @@ -97,7 +100,9 @@ def transform(self, X, y): # Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool if (y_df.dtypes == bool).all(): - return X, _retain_custom_types_and_initalize_woodwork(y_ww.ww.logical_type, y) + return X, _retain_custom_types_and_initalize_woodwork( + y_ww.ww.logical_type, y + ) transformed = self._component_obj.transform(y_df) if transformed.shape[1] == 0: diff --git a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py index 361dcf0520..2456a154c2 100644 --- a/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/datetime_featurizer.py @@ -1,7 +1,7 @@ from evalml.pipelines.components.transformers import Transformer from evalml.utils import ( _retain_custom_types_and_initalize_woodwork, - infer_feature_types + infer_feature_types, ) @@ -9,8 +9,20 @@ def _extract_year(col, encode_as_categories=False): return col.dt.year, None -_month_to_int_mapping = {"January": 0, "February": 1, "March": 2, "April": 3, "May": 4, "June": 5, - "July": 6, "August": 7, "September": 8, "October": 9, "November": 10, "December": 11} +_month_to_int_mapping = { + "January": 0, + "February": 1, + "March": 2, + "April": 3, + "May": 4, + "June": 5, + "July": 6, + "August": 7, + "September": 8, + "October": 9, + "November": 10, + "December": 11, +} def _extract_month(col, encode_as_categories=False): @@ -22,8 +34,15 @@ def _extract_month(col, encode_as_categories=False): return months_encoded, {m: _month_to_int_mapping[m] for m in months_unique} -_day_to_int_mapping = {"Sunday": 0, "Monday": 1, "Tuesday": 2, "Wednesday": 3, "Thursday": 4, "Friday": 5, - "Saturday": 6} +_day_to_int_mapping = { + "Sunday": 0, + "Monday": 1, + "Tuesday": 2, + "Wednesday": 3, + "Thursday": 4, + "Friday": 5, + "Saturday": 6, +} def _extract_day_of_week(col, encode_as_categories=False): @@ -41,14 +60,24 @@ def _extract_hour(col, encode_as_categories=False): class DateTimeFeaturizer(Transformer): """Transformer that can automatically featurize DateTime columns.""" + name = "DateTime Featurization Component" hyperparameter_ranges = {} - _function_mappings = {"year": _extract_year, - "month": _extract_month, 
- "day_of_week": _extract_day_of_week, - "hour": _extract_hour} - - def __init__(self, features_to_extract=None, encode_as_categories=False, date_index=None, random_seed=0, **kwargs): + _function_mappings = { + "year": _extract_year, + "month": _extract_month, + "day_of_week": _extract_day_of_week, + "hour": _extract_hour, + } + + def __init__( + self, + features_to_extract=None, + encode_as_categories=False, + date_index=None, + random_seed=0, + **kwargs, + ): """Extracts features from DateTime columns Arguments: @@ -60,20 +89,28 @@ def __init__(self, features_to_extract=None, encode_as_categories=False, date_in """ if features_to_extract is None: features_to_extract = ["year", "month", "day_of_week", "hour"] - invalid_features = set(features_to_extract) - set(self._function_mappings.keys()) + invalid_features = set(features_to_extract) - set( + self._function_mappings.keys() + ) if len(invalid_features) > 0: - raise ValueError("{} are not valid options for features_to_extract".format(", ".join([f"'{feature}'" for feature in invalid_features]))) - - parameters = {"features_to_extract": features_to_extract, - "encode_as_categories": encode_as_categories, - "date_index": date_index} + raise ValueError( + "{} are not valid options for features_to_extract".format( + ", ".join([f"'{feature}'" for feature in invalid_features]) + ) + ) + + parameters = { + "features_to_extract": features_to_extract, + "encode_as_categories": encode_as_categories, + "date_index": date_index, + } parameters.update(kwargs) self._date_time_col_names = None self._categories = {} self.encode_as_categories = encode_as_categories - super().__init__(parameters=parameters, - component_obj=None, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=None, random_seed=random_seed + ) def fit(self, X, y=None): X = infer_feature_types(X) @@ -99,11 +136,15 @@ def transform(self, X, y=None): for col_name in self._date_time_col_names: for feature in features_to_extract: name = f"{col_name}_{feature}" - features, categories = self._function_mappings[feature](X[col_name], self.encode_as_categories) + features, categories = self._function_mappings[feature]( + X[col_name], self.encode_as_categories + ) X[name] = features if categories: self._categories[name] = categories - return _retain_custom_types_and_initalize_woodwork(original_ltypes, X.drop(columns=self._date_time_col_names)) + return _retain_custom_types_and_initalize_woodwork( + original_ltypes, X.drop(columns=self._date_time_col_names) + ) def get_feature_names(self): """Gets the categories of each datetime feature. 
@@ -118,6 +159,6 @@ def _get_feature_provenance(self):
provenance = {}
for col_name in self._date_time_col_names:
provenance[col_name] = []
- for feature in self.parameters['features_to_extract']:
- provenance[col_name].append(f'{col_name}_{feature}')
+ for feature in self.parameters["features_to_extract"]:
+ provenance[col_name].append(f"{col_name}_{feature}")
return provenance
diff --git a/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py b/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py
index 32423e3f0e..6232e6df80 100644
--- a/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py
+++ b/evalml/pipelines/components/transformers/preprocessing/delayed_feature_transformer.py
@@ -1,4 +1,3 @@
-
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from woodwork import logical_types
@@ -9,11 +8,21 @@
class DelayedFeatureTransformer(Transformer):
"""Transformer that delays input features and target variable for time series problems."""
+
name = "Delayed Feature Transformer"
hyperparameter_ranges = {}
needs_fitting = False
- def __init__(self, date_index=None, max_delay=2, delay_features=True, delay_target=True, gap=1, random_seed=0, **kwargs):
+ def __init__(
+ self,
+ date_index=None,
+ max_delay=2,
+ delay_features=True,
+ delay_target=True,
+ gap=1,
+ random_seed=0,
+ **kwargs,
+ ):
"""Creates a DelayedFeatureTransformer.
Arguments:
@@ -35,8 +44,13 @@ def __init__(self, date_index=None, max_delay=2, delay_features=True, delay_targ
# If 0, start at 1
self.start_delay_for_target = int(gap == 0)
- parameters = {"date_index": date_index, "max_delay": max_delay, "delay_target": delay_target, "delay_features": delay_features,
- "gap": gap}
+ parameters = {
+ "date_index": date_index,
+ "max_delay": max_delay,
+ "delay_target": delay_target,
+ "delay_features": delay_features,
+ "gap": gap,
+ }
parameters.update(kwargs)
super().__init__(parameters=parameters, random_seed=random_seed)
@@ -60,12 +74,19 @@ def _encode_y_while_preserving_index(y):
@staticmethod
def _get_categorical_columns(X):
- return [name for name, column in X.ww.columns.items() if column.logical_type == logical_types.Categorical]
+ return [
+ name
+ for name, column in X.ww.columns.items()
+ if column.logical_type == logical_types.Categorical
+ ]
@staticmethod
def _encode_X_while_preserving_index(X_categorical):
- return pd.DataFrame(OrdinalEncoder().fit_transform(X_categorical),
- columns=X_categorical.columns, index=X_categorical.index)
+ return pd.DataFrame(
+ OrdinalEncoder().fit_transform(X_categorical),
+ columns=X_categorical.columns,
+ index=X_categorical.index,
+ )
def transform(self, X, y=None):
"""Computes the delayed features for all features in X and y. 
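For context on the DelayedFeatureTransformer hunks around this point: delaying a feature amounts to shifting it by 1 through `max_delay` periods, producing one lagged copy per delay. A simplified sketch with an invented column (the exact delay-column naming used by the component may differ):

    import pandas as pd

    max_delay = 2
    X = pd.DataFrame({"feature": [1, 2, 3, 4, 5]})
    for delay in range(1, max_delay + 1):
        # Each delayed copy is the original column shifted down by `delay` rows.
        X[f"feature_delay_{delay}"] = X["feature"].shift(delay)
    print(X)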
@@ -91,7 +112,9 @@ def transform(self, X, y=None): X_ww = X_ww.ww.copy() categorical_columns = self._get_categorical_columns(X_ww) if self.delay_features and len(X) > 0: - X_categorical = self._encode_X_while_preserving_index(X_ww[categorical_columns]) + X_categorical = self._encode_X_while_preserving_index( + X_ww[categorical_columns] + ) for col_name in X_ww: col = X_ww[col_name] if col_name in categorical_columns: diff --git a/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py b/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py index 3c6a5e25ef..94192c5a36 100644 --- a/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py +++ b/evalml/pipelines/components/transformers/preprocessing/drop_null_columns.py @@ -4,6 +4,7 @@ class DropNullColumns(Transformer): """Transformer to drop features whose percentage of NaN values exceeds a specified threshold""" + name = "Drop Null Columns Transformer" hyperparameter_ranges = {} @@ -17,14 +18,16 @@ def __init__(self, pct_null_threshold=1.0, random_seed=0, **kwargs): random_seed (int): Seed for the random number generator. Defaults to 0. """ if pct_null_threshold < 0 or pct_null_threshold > 1: - raise ValueError("pct_null_threshold must be a float between 0 and 1, inclusive.") + raise ValueError( + "pct_null_threshold must be a float between 0 and 1, inclusive." + ) parameters = {"pct_null_threshold": pct_null_threshold} parameters.update(kwargs) self._cols_to_drop = None - super().__init__(parameters=parameters, - component_obj=None, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=None, random_seed=random_seed + ) def fit(self, X, y=None): pct_null_threshold = self.parameters["pct_null_threshold"] diff --git a/evalml/pipelines/components/transformers/preprocessing/featuretools.py b/evalml/pipelines/components/transformers/preprocessing/featuretools.py index 76ed64862c..6967a9807c 100644 --- a/evalml/pipelines/components/transformers/preprocessing/featuretools.py +++ b/evalml/pipelines/components/transformers/preprocessing/featuretools.py @@ -3,16 +3,17 @@ from evalml.pipelines.components.transformers.transformer import Transformer from evalml.utils import ( _retain_custom_types_and_initalize_woodwork, - infer_feature_types + infer_feature_types, ) class DFSTransformer(Transformer): """Featuretools DFS component that generates features for pd.DataFrames""" + name = "DFS Transformer" hyperparameter_ranges = {} - def __init__(self, index='index', random_seed=0, **kwargs): + def __init__(self, index="index", random_seed=0, **kwargs): """Allows for featuretools to be used in EvalML. 
Arguments: @@ -27,16 +28,19 @@ def __init__(self, index='index', random_seed=0, **kwargs): self.index = index self.features = None parameters.update(kwargs) - super().__init__(parameters=parameters, - random_seed=random_seed) + super().__init__(parameters=parameters, random_seed=random_seed) def _make_entity_set(self, X): """Helper method that creates and returns the entity set given the input data""" ft_es = EntitySet() if self.index not in X.columns: - es = ft_es.entity_from_dataframe(entity_id="X", dataframe=X, index=self.index, make_index=True) + es = ft_es.entity_from_dataframe( + entity_id="X", dataframe=X, index=self.index, make_index=True + ) else: - es = ft_es.entity_from_dataframe(entity_id="X", dataframe=X, index=self.index) + es = ft_es.entity_from_dataframe( + entity_id="X", dataframe=X, index=self.index + ) return es def fit(self, X, y=None): @@ -52,10 +56,9 @@ def fit(self, X, y=None): X_ww = infer_feature_types(X) X_ww = X_ww.ww.rename({col: str(col) for col in X_ww.columns}) es = self._make_entity_set(X_ww) - self.features = dfs(entityset=es, - target_entity='X', - features_only=True, - max_depth=1) + self.features = dfs( + entityset=es, target_entity="X", features_only=True, max_depth=1 + ) return self def transform(self, X, y=None): @@ -72,4 +75,6 @@ def transform(self, X, y=None): X_ww = X_ww.ww.rename({col: str(col) for col in X_ww.columns}) es = self._make_entity_set(X_ww) feature_matrix = calculate_feature_matrix(features=self.features, entityset=es) - return _retain_custom_types_and_initalize_woodwork(X_ww.ww.logical_types, feature_matrix) + return _retain_custom_types_and_initalize_woodwork( + X_ww.ww.logical_types, feature_matrix + ) diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index c69e958b09..8ce187e502 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -3,14 +3,13 @@ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.pipeline import make_pipeline -from evalml.pipelines.components.transformers.preprocessing import ( - TextTransformer -) +from evalml.pipelines.components.transformers.preprocessing import TextTransformer from evalml.utils import infer_feature_types class LSA(TextTransformer): """Transformer to calculate the Latent Semantic Analysis Values of text input""" + name = "LSA Transformer" hyperparameter_ranges = {} @@ -20,10 +19,11 @@ def __init__(self, random_seed=0, **kwargs): Arguments: random_seed (int): Seed for the random number generator. Defaults to 0. 
""" - self._lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=random_seed)) + self._lsa_pipeline = make_pipeline( + TfidfVectorizer(), TruncatedSVD(random_state=random_seed) + ) self._provenance = {} - super().__init__(random_seed=random_seed, - **kwargs) + super().__init__(random_seed=random_seed, **kwargs) def fit(self, X, y=None): X = infer_feature_types(X) @@ -55,9 +55,13 @@ def transform(self, X, y=None): provenance = {} for col in self._text_columns: transformed = self._lsa_pipeline.transform(X_ww[col]) - X_ww.ww['LSA({})[0]'.format(col)] = pd.Series(transformed[:, 0], index=X_ww.index) - X_ww.ww['LSA({})[1]'.format(col)] = pd.Series(transformed[:, 1], index=X_ww.index) - provenance[col] = ['LSA({})[0]'.format(col), 'LSA({})[1]'.format(col)] + X_ww.ww["LSA({})[0]".format(col)] = pd.Series( + transformed[:, 0], index=X_ww.index + ) + X_ww.ww["LSA({})[1]".format(col)] = pd.Series( + transformed[:, 1], index=X_ww.index + ) + provenance[col] = ["LSA({})[0]".format(col), "LSA({})[1]".format(col)] self._provenance = provenance X_t = X_ww.ww.drop(columns=self._text_columns) diff --git a/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py b/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py index 62da23a797..851a73c1e6 100644 --- a/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py +++ b/evalml/pipelines/components/transformers/preprocessing/polynomial_detrender.py @@ -7,6 +7,7 @@ class PolynomialDetrender(Transformer): """Removes trends from time series by fitting a polynomial to the data.""" + name = "Polynomial Detrender" hyperparameter_ranges = {"degree": Integer(1, 3)} @@ -23,20 +24,24 @@ def __init__(self, degree=1, random_seed=0, **kwargs): if isinstance(degree, float) and degree.is_integer(): degree = int(degree) else: - raise TypeError(f"Parameter Degree must be an integer!: Received {type(degree).__name__}") + raise TypeError( + f"Parameter Degree must be an integer!: Received {type(degree).__name__}" + ) params = {"degree": degree} params.update(kwargs) error_msg = "sktime is not installed. Please install using 'pip install sktime'" trend = import_or_raise("sktime.forecasting.trend", error_msg=error_msg) - detrend = import_or_raise("sktime.transformations.series.detrend", error_msg=error_msg) + detrend = import_or_raise( + "sktime.transformations.series.detrend", error_msg=error_msg + ) detrender = detrend.Detrender(trend.PolynomialTrendForecaster(degree=degree)) - super().__init__(parameters=params, - component_obj=detrender, - random_seed=random_seed) + super().__init__( + parameters=params, component_obj=detrender, random_seed=random_seed + ) def fit(self, X, y=None): """Fits the PolynomialDetrender. 
diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py index 563b682a8c..8bd8b693fe 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py @@ -3,15 +3,13 @@ import featuretools as ft import nlp_primitives -from evalml.pipelines.components.transformers.preprocessing import ( - LSA, - TextTransformer -) +from evalml.pipelines.components.transformers.preprocessing import LSA, TextTransformer from evalml.utils import infer_feature_types class TextFeaturizer(TextTransformer): """Transformer that can automatically featurize text columns.""" + name = "Text Featurization Component" hyperparameter_ranges = {} @@ -21,20 +19,21 @@ def __init__(self, random_seed=0, **kwargs): Arguments: random_seed (int): Seed for the random number generator. Defaults to 0. """ - self._trans = [nlp_primitives.DiversityScore, - nlp_primitives.MeanCharactersPerWord, - nlp_primitives.PolarityScore] + self._trans = [ + nlp_primitives.DiversityScore, + nlp_primitives.MeanCharactersPerWord, + nlp_primitives.PolarityScore, + ] self._features = None self._lsa = LSA(random_seed=random_seed) self._primitives_provenance = {} - super().__init__(random_seed=random_seed, - **kwargs) + super().__init__(random_seed=random_seed, **kwargs) def _clean_text(self, X): """Remove all non-alphanum chars other than spaces, and make lowercase""" def normalize(text): - text = text.translate(str.maketrans('', '', string.punctuation)) + text = text.translate(str.maketrans("", "", string.punctuation)) return text.lower() for col_name in X.columns: @@ -49,11 +48,18 @@ def _make_entity_set(self, X, text_columns): # featuretools expects str-type column names X_text.rename(columns=str, inplace=True) - all_text_variable_types = {col_name: 'natural_language' for col_name in X_text.columns} + all_text_variable_types = { + col_name: "natural_language" for col_name in X_text.columns + } es = ft.EntitySet() - es.entity_from_dataframe(entity_id='X', dataframe=X_text, index='index', make_index=True, - variable_types=all_text_variable_types) + es.entity_from_dataframe( + entity_id="X", + dataframe=X_text, + index="index", + make_index=True, + variable_types=all_text_variable_types, + ) return es def fit(self, X, y=None): @@ -74,11 +80,13 @@ def fit(self, X, y=None): self._lsa.fit(X) es = self._make_entity_set(X, self._text_columns) - self._features = ft.dfs(entityset=es, - target_entity='X', - trans_primitives=self._trans, - max_depth=1, - features_only=True) + self._features = ft.dfs( + entityset=es, + target_entity="X", + trans_primitives=self._trans, + max_depth=1, + features_only=True, + ) return self @staticmethod @@ -108,7 +116,9 @@ def transform(self, X, y=None): if self._features is None or len(self._features) == 0: return X_ww es = self._make_entity_set(X_ww, self._text_columns) - X_nlp_primitives = ft.calculate_feature_matrix(features=self._features, entityset=es) + X_nlp_primitives = ft.calculate_feature_matrix( + features=self._features, entityset=es + ) if X_nlp_primitives.isnull().any().any(): X_nlp_primitives.fillna(0, inplace=True) diff --git a/evalml/pipelines/components/transformers/preprocessing/text_transformer.py b/evalml/pipelines/components/transformers/preprocessing/text_transformer.py index 200017bebb..3948b66de1 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_transformer.py +++ 
b/evalml/pipelines/components/transformers/preprocessing/text_transformer.py @@ -16,10 +16,10 @@ def __init__(self, component_obj=None, random_seed=0, **kwargs): parameters = {} parameters.update(kwargs) - super().__init__(parameters=parameters, - component_obj=component_obj, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=component_obj, random_seed=random_seed + ) def _get_text_columns(self, X): """Returns the ordered list of columns names in the input which have been designated as text columns.""" - return list(X.ww.select('NaturalLanguage').columns) + return list(X.ww.select("NaturalLanguage").columns) diff --git a/evalml/pipelines/components/transformers/samplers/base_sampler.py b/evalml/pipelines/components/transformers/samplers/base_sampler.py index 04399a8092..0208e49243 100644 --- a/evalml/pipelines/components/transformers/samplers/base_sampler.py +++ b/evalml/pipelines/components/transformers/samplers/base_sampler.py @@ -69,7 +69,9 @@ def _convert_dictionary(self, sampling_dict, y): # check that the lengths of the dict and y are equal y_unique = y.unique() if len(sampling_dict) != len(y_unique): - raise ValueError("Sampling dictionary contains a different number of targets than are provided in the data.") + raise ValueError( + "Sampling dictionary contains a different number of targets than are provided in the data." + ) if len(set(sampling_dict.keys()).intersection(set(y_unique))) != len(y_unique): raise ValueError("Dictionary keys are different from target values!") @@ -100,16 +102,27 @@ def _dictionary_to_params(self, sampling_dict, y): dict: The parameters dictionary with the sampling_ratio_dict value replaced as necessary """ param_copy = copy.copy(self.parameters) - if self.parameters['sampling_ratio_dict']: - new_dic = self._convert_dictionary(self.parameters['sampling_ratio_dict'], y) - param_copy['sampling_ratio_dict'] = new_dic + if self.parameters["sampling_ratio_dict"]: + new_dic = self._convert_dictionary( + self.parameters["sampling_ratio_dict"], y + ) + param_copy["sampling_ratio_dict"] = new_dic return param_copy class BaseOverSampler(BaseSampler): """Base Oversampler component. Used as the base class of all imbalance-learn oversampler components""" - def __init__(self, sampler, sampling_ratio=0.25, sampling_ratio_dict=None, k_neighbors=5, n_jobs=-1, random_seed=0, **kwargs): + def __init__( + self, + sampler, + sampling_ratio=0.25, + sampling_ratio_dict=None, + k_neighbors=5, + n_jobs=-1, + random_seed=0, + **kwargs + ): """Initializes the oversampler component. Arguments: @@ -121,17 +134,19 @@ def __init__(self, sampler, sampling_ratio=0.25, sampling_ratio_dict=None, k_nei """ error_msg = "imbalanced-learn is not installed. 
Please install using 'pip install imbalanced-learn'"
im = import_or_raise("imblearn.over_sampling", error_msg=error_msg)
- parameters = {"sampling_ratio": sampling_ratio,
- "k_neighbors": k_neighbors,
- "n_jobs": n_jobs,
- "sampling_ratio_dict": sampling_ratio_dict}
+ parameters = {
+ "sampling_ratio": sampling_ratio,
+ "k_neighbors": k_neighbors,
+ "n_jobs": n_jobs,
+ "sampling_ratio_dict": sampling_ratio_dict,
+ }
parameters.update(kwargs)
- self.sampler = {"SMOTE": im.SMOTE,
- "SMOTENC": im.SMOTENC,
- "SMOTEN": im.SMOTEN}[sampler]
- super().__init__(parameters=parameters,
- component_obj=None,
- random_seed=random_seed)
+ self.sampler = {"SMOTE": im.SMOTE, "SMOTENC": im.SMOTENC, "SMOTEN": im.SMOTEN}[
+ sampler
+ ]
+ super().__init__(
+ parameters=parameters, component_obj=None, random_seed=random_seed
+ )
def fit(self, X, y):
"""Fits the Oversampler to the data.
@@ -156,15 +171,19 @@ def _initialize_oversampler(self, X, y, sampler_class):
sampler_class (imblearn.BaseSampler): The sampler we want to initialize
"""
_, y_ww = self._prepare_data(X, y)
- sampler_params = {k: v for k, v in copy.copy(self.parameters).items() if k not in ['sampling_ratio', 'sampling_ratio_dict']}
- if self.parameters['sampling_ratio_dict'] is not None:
+ sampler_params = {
+ k: v
+ for k, v in copy.copy(self.parameters).items()
+ if k not in ["sampling_ratio", "sampling_ratio_dict"]
+ }
+ if self.parameters["sampling_ratio_dict"] is not None:
# make the dictionary
- dic = self._convert_dictionary(self.parameters['sampling_ratio_dict'], y_ww)
+ dic = self._convert_dictionary(self.parameters["sampling_ratio_dict"], y_ww)
else:
# create the sampling dictionary
- sampling_ratio = self.parameters['sampling_ratio']
+ sampling_ratio = self.parameters["sampling_ratio"]
dic = make_balancing_dictionary(y_ww, sampling_ratio)
- sampler_params['sampling_strategy'] = dic
+ sampler_params["sampling_strategy"] = dic
sampler = sampler_class(**sampler_params, random_state=self.random_seed)
self._component_obj = sampler
diff --git a/evalml/pipelines/components/transformers/samplers/oversamplers.py b/evalml/pipelines/components/transformers/samplers/oversamplers.py
index 64a6274eed..7dad056e2f 100644
--- a/evalml/pipelines/components/transformers/samplers/oversamplers.py
+++ b/evalml/pipelines/components/transformers/samplers/oversamplers.py
@@ -1,42 +1,56 @@
from evalml.pipelines.components.transformers.samplers.base_sampler import (
- BaseOverSampler
+ BaseOverSampler,
)
from evalml.utils.woodwork_utils import infer_feature_types
class SMOTESampler(BaseOverSampler):
"""SMOTE Oversampler component. Works on numerical datasets only. This component is only run during training and not during predict."""
+
name = "SMOTE Oversampler"
hyperparameter_ranges = {}
- def __init__(self, sampling_ratio=0.25, k_neighbors=5, n_jobs=-1, random_seed=0, **kwargs):
- super().__init__("SMOTE",
- sampling_ratio=sampling_ratio,
- k_neighbors=k_neighbors,
- n_jobs=n_jobs,
- random_seed=random_seed,
- **kwargs)
+ def __init__(
+ self, sampling_ratio=0.25, k_neighbors=5, n_jobs=-1, random_seed=0, **kwargs
+ ):
+ super().__init__(
+ "SMOTE",
+ sampling_ratio=sampling_ratio,
+ k_neighbors=k_neighbors,
+ n_jobs=n_jobs,
+ random_seed=random_seed,
+ **kwargs
+ )
class SMOTENCSampler(BaseOverSampler):
"""SMOTENC Oversampler component. Uses SMOTENC to generate synthetic samples. Works on a mix of numerical and categorical columns. 
- Input data must be Woodwork type, and this component is only run during training and not during predict.""" + Input data must be Woodwork type, and this component is only run during training and not during predict.""" + name = "SMOTENC Oversampler" hyperparameter_ranges = {} - def __init__(self, sampling_ratio=0.25, k_neighbors=5, n_jobs=-1, random_seed=0, **kwargs): + def __init__( + self, sampling_ratio=0.25, k_neighbors=5, n_jobs=-1, random_seed=0, **kwargs + ): self.categorical_features = None - super().__init__("SMOTENC", - sampling_ratio=sampling_ratio, - k_neighbors=k_neighbors, - n_jobs=n_jobs, - random_seed=random_seed, - **kwargs) + super().__init__( + "SMOTENC", + sampling_ratio=sampling_ratio, + k_neighbors=k_neighbors, + n_jobs=n_jobs, + random_seed=random_seed, + **kwargs + ) def _get_categorical(self, X): X = infer_feature_types(X) - self.categorical_features = [i for i, val in enumerate(X.ww.types['Logical Type'].items()) if str(val[1]) in {'Boolean', 'Categorical'}] - self._parameters['categorical_features'] = self.categorical_features + self.categorical_features = [ + i + for i, val in enumerate(X.ww.types["Logical Type"].items()) + if str(val[1]) in {"Boolean", "Categorical"} + ] + self._parameters["categorical_features"] = self.categorical_features def fit(self, X, y): # get categorical features first @@ -46,14 +60,19 @@ def fit(self, X, y): class SMOTENSampler(BaseOverSampler): """SMOTEN Oversampler component. Uses SMOTEN to generate synthetic samples. Works for purely categorical datasets. - This component is only run during training and not during predict.""" + This component is only run during training and not during predict.""" + name = "SMOTEN Oversampler" hyperparameter_ranges = {} - def __init__(self, sampling_ratio=0.25, k_neighbors=5, n_jobs=-1, random_seed=0, **kwargs): - super().__init__("SMOTEN", - sampling_ratio=sampling_ratio, - k_neighbors=k_neighbors, - n_jobs=n_jobs, - random_seed=random_seed, - **kwargs) + def __init__( + self, sampling_ratio=0.25, k_neighbors=5, n_jobs=-1, random_seed=0, **kwargs + ): + super().__init__( + "SMOTEN", + sampling_ratio=sampling_ratio, + k_neighbors=k_neighbors, + n_jobs=n_jobs, + random_seed=random_seed, + **kwargs + ) diff --git a/evalml/pipelines/components/transformers/samplers/undersampler.py b/evalml/pipelines/components/transformers/samplers/undersampler.py index c67968c598..98ee10783e 100644 --- a/evalml/pipelines/components/transformers/samplers/undersampler.py +++ b/evalml/pipelines/components/transformers/samplers/undersampler.py @@ -1,19 +1,26 @@ import pandas as pd -from evalml.pipelines.components.transformers.samplers.base_sampler import ( - BaseSampler -) +from evalml.pipelines.components.transformers.samplers.base_sampler import BaseSampler from evalml.preprocessing.data_splitters.balanced_classification_sampler import ( - BalancedClassificationSampler + BalancedClassificationSampler, ) class Undersampler(BaseSampler): """Random undersampler component. This component is only run during training and not during predict.""" + name = "Undersampler" hyperparameter_ranges = {} - def __init__(self, sampling_ratio=0.25, sampling_ratio_dict=None, min_samples=100, min_percentage=0.1, random_seed=0, **kwargs): + def __init__( + self, + sampling_ratio=0.25, + sampling_ratio_dict=None, + min_samples=100, + min_percentage=0.1, + random_seed=0, + **kwargs + ): """Initializes an undersampling transformer to downsample the majority classes in the dataset. 
Arguments: @@ -30,15 +37,17 @@ def __init__(self, sampling_ratio=0.25, sampling_ratio_dict=None, min_samples=10 Must be between 0 and 0.5, inclusive. Defaults to 0.1. random_seed (int): The seed to use for random sampling. Defaults to 0. """ - parameters = {"sampling_ratio": sampling_ratio, - "min_samples": min_samples, - "min_percentage": min_percentage, - "sampling_ratio_dict": sampling_ratio_dict} + parameters = { + "sampling_ratio": sampling_ratio, + "min_samples": min_samples, + "min_percentage": min_percentage, + "sampling_ratio_dict": sampling_ratio_dict, + } parameters.update(kwargs) - super().__init__(parameters=parameters, - component_obj=None, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=None, random_seed=random_seed + ) def _initialize_undersampler(self, y): """Helper function to initialize the undersampler component object. @@ -46,8 +55,12 @@ def _initialize_undersampler(self, y): Arguments: y (pd.Series): The target data """ - param_dic = self._dictionary_to_params(self.parameters['sampling_ratio_dict'], y) - sampler = BalancedClassificationSampler(**param_dic, random_seed=self.random_seed) + param_dic = self._dictionary_to_params( + self.parameters["sampling_ratio_dict"], y + ) + sampler = BalancedClassificationSampler( + **param_dic, random_seed=self.random_seed + ) self._component_obj = sampler def fit_transform(self, X, y): diff --git a/evalml/pipelines/components/transformers/scalers/standard_scaler.py b/evalml/pipelines/components/transformers/scalers/standard_scaler.py index e8fa36f981..b79b9842be 100644 --- a/evalml/pipelines/components/transformers/scalers/standard_scaler.py +++ b/evalml/pipelines/components/transformers/scalers/standard_scaler.py @@ -5,12 +5,13 @@ from evalml.pipelines.components.transformers import Transformer from evalml.utils import ( _retain_custom_types_and_initalize_woodwork, - infer_feature_types + infer_feature_types, ) class StandardScaler(Transformer): """Standardize features: removes mean and scales to unit variance.""" + name = "Standard Scaler" hyperparameter_ranges = {} @@ -19,19 +20,21 @@ def __init__(self, random_seed=0, **kwargs): parameters.update(kwargs) scaler = SkScaler(**parameters) - super().__init__(parameters=parameters, - component_obj=scaler, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=scaler, random_seed=random_seed + ) def transform(self, X, y=None): X = infer_feature_types(X) original_ltypes = X.ww.schema.logical_types - X = X.ww.select_dtypes(exclude=['datetime']) + X = X.ww.select_dtypes(exclude=["datetime"]) X_t = self._component_obj.transform(X) X_t_df = pd.DataFrame(X_t, columns=X.columns, index=X.index) - return _retain_custom_types_and_initalize_woodwork(original_ltypes, X_t_df, ltypes_to_ignore=[Integer, Categorical, Boolean]) + return _retain_custom_types_and_initalize_woodwork( + original_ltypes, X_t_df, ltypes_to_ignore=[Integer, Categorical, Boolean] + ) def fit_transform(self, X, y=None): X = infer_feature_types(X) - X = X.select_dtypes(exclude=['datetime']) + X = X.select_dtypes(exclude=["datetime"]) return self.fit(X, y).transform(X, y) diff --git a/evalml/pipelines/components/transformers/transformer.py b/evalml/pipelines/components/transformers/transformer.py index 38bad0ff80..3e706ac626 100644 --- a/evalml/pipelines/components/transformers/transformer.py +++ b/evalml/pipelines/components/transformers/transformer.py @@ -5,7 +5,7 @@ from evalml.pipelines.components import ComponentBase from evalml.utils import ( 
_retain_custom_types_and_initalize_woodwork, - infer_feature_types + infer_feature_types, ) @@ -40,9 +40,13 @@ def transform(self, X, y=None): try: X_t = self._component_obj.transform(X, y) except AttributeError: - raise MethodPropertyNotFoundError("Transformer requires a transform method or a component_obj that implements transform") + raise MethodPropertyNotFoundError( + "Transformer requires a transform method or a component_obj that implements transform" + ) X_t_df = pd.DataFrame(X_t, columns=X_ww.columns, index=X_ww.index) - return _retain_custom_types_and_initalize_woodwork(X_ww.ww.logical_types, X_t_df) + return _retain_custom_types_and_initalize_woodwork( + X_ww.ww.logical_types, X_t_df + ) def fit_transform(self, X, y=None): """Fits on X and transforms X @@ -59,7 +63,9 @@ def fit_transform(self, X, y=None): y_ww = infer_feature_types(y) try: X_t = self._component_obj.fit_transform(X_ww, y_ww) - return _retain_custom_types_and_initalize_woodwork(X_ww.ww.logical_types, X_t) + return _retain_custom_types_and_initalize_woodwork( + X_ww.ww.logical_types, X_t + ) except AttributeError: try: return self.fit(X, y).transform(X, y) diff --git a/evalml/pipelines/components/utils.py b/evalml/pipelines/components/utils.py index ef25a0c6af..934ffb723c 100644 --- a/evalml/pipelines/components/utils.py +++ b/evalml/pipelines/components/utils.py @@ -44,7 +44,10 @@ def allowed_model_families(problem_type): estimators = [] problem_type = handle_problem_types(problem_type) for estimator in _all_estimators_used_in_search(): - if problem_type in set(handle_problem_types(problem) for problem in estimator.supported_problem_types): + if problem_type in set( + handle_problem_types(problem) + for problem in estimator.supported_problem_types + ): estimators.append(estimator) return list(set([e.model_family for e in estimators])) @@ -68,15 +71,23 @@ def get_estimators(problem_type, model_families=None): if model_families is None: model_families = allowed_model_families(problem_type) - model_families = [handle_model_family(model_family) for model_family in model_families] + model_families = [ + handle_model_family(model_family) for model_family in model_families + ] all_model_families = allowed_model_families(problem_type) for model_family in model_families: if model_family not in all_model_families: - raise RuntimeError("Unrecognized model type for problem type %s: %s" % (problem_type, model_family)) + raise RuntimeError( + "Unrecognized model type for problem type %s: %s" + % (problem_type, model_family) + ) estimator_classes = [] for estimator_class in _all_estimators_used_in_search(): - if problem_type not in [handle_problem_types(supported_pt) for supported_pt in estimator_class.supported_problem_types]: + if problem_type not in [ + handle_problem_types(supported_pt) + for supported_pt in estimator_class.supported_problem_types + ]: continue if estimator_class.model_family not in model_families: continue @@ -100,11 +111,16 @@ def handle_component_class(component_class): if inspect.isclass(component_class) and issubclass(component_class, ComponentBase): return component_class if not isinstance(component_class, str): - raise ValueError(("component_graph may only contain str or ComponentBase subclasses, not '{}'") - .format(type(component_class))) + raise ValueError( + ( + "component_graph may only contain str or ComponentBase subclasses, not '{}'" + ).format(type(component_class)) + ) component_classes = {component.name: component for component in all_components()} if component_class not in 
component_classes: - raise MissingComponentError('Component "{}" was not found'.format(component_class)) + raise MissingComponentError( + 'Component "{}" was not found'.format(component_class) + ) component_class = component_classes[component_class] return component_class @@ -152,7 +168,7 @@ def predict(self, X): Returns: np.ndarray: Predicted values """ - check_is_fitted(self, 'is_fitted_') + check_is_fitted(self, "is_fitted_") return self.pipeline.predict(X).to_numpy() @@ -214,16 +230,29 @@ def scikit_learn_wrapped_estimator(evalml_obj): """Wrap an EvalML pipeline or estimator in a scikit-learn estimator.""" if isinstance(evalml_obj, PipelineBase): - if evalml_obj.problem_type in [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]: + if evalml_obj.problem_type in [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ]: return WrappedSKRegressor(evalml_obj) - elif evalml_obj.problem_type == ProblemTypes.BINARY or evalml_obj.problem_type == ProblemTypes.MULTICLASS: + elif ( + evalml_obj.problem_type == ProblemTypes.BINARY + or evalml_obj.problem_type == ProblemTypes.MULTICLASS + ): return WrappedSKClassifier(evalml_obj) else: # EvalML Estimator - if evalml_obj.supported_problem_types == [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]: + if evalml_obj.supported_problem_types == [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ]: return WrappedSKRegressor(evalml_obj) - elif evalml_obj.supported_problem_types == [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS]: + elif evalml_obj.supported_problem_types == [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ]: return WrappedSKClassifier(evalml_obj) raise ValueError("Could not wrap EvalML object in scikit-learn wrapper.") @@ -243,16 +272,21 @@ def generate_component_code(element): base_string = "" if not isinstance(element, ComponentBase): - raise ValueError("Element must be a component instance, received {}".format(type(element))) + raise ValueError( + "Element must be a component instance, received {}".format(type(element)) + ) if element.__class__ in all_components(): - code_strings.append("from {} import {}\n".format(element.__class__.__module__, element.__class__.__name__)) + code_strings.append( + "from {} import {}\n".format( + element.__class__.__module__, element.__class__.__name__ + ) + ) component_parameters = element.parameters - name = element.name[0].lower() + element.name[1:].replace(' ', '') - base_string += "{0} = {1}(**{2})" \ - .format(name, - element.__class__.__name__, - component_parameters) + name = element.name[0].lower() + element.name[1:].replace(" ", "") + base_string += "{0} = {1}(**{2})".format( + name, element.__class__.__name__, component_parameters + ) code_strings.append(base_string) return "\n".join(code_strings) @@ -272,7 +306,9 @@ def make_balancing_dictionary(y, sampling_ratio): for each class that will satisfy sampling_ratio. 
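For a concrete sense of the return value described above, here is a small hypothetical sketch of calling this helper; the class counts and the expected result are illustrative assumptions rather than part of this patch.

import pandas as pd
from evalml.pipelines.components.utils import make_balancing_dictionary

# Hypothetical target: 90 samples of class 0 and 10 samples of class 1.
y = pd.Series([0] * 90 + [1] * 10)

# With sampling_ratio=0.5, the minority class should be brought up to roughly
# half of the majority count, so the result is expected to be about {0: 90, 1: 45}.
make_balancing_dictionary(y, sampling_ratio=0.5)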
""" if sampling_ratio <= 0 or sampling_ratio > 1: - raise ValueError("Sampling ratio must be in range (0, 1], received {}".format(sampling_ratio)) + raise ValueError( + "Sampling ratio must be in range (0, 1], received {}".format(sampling_ratio) + ) if len(y) == 0: raise ValueError("Target data must not be empty") value_counts = y.value_counts() diff --git a/evalml/pipelines/multiclass_classification_pipeline.py b/evalml/pipelines/multiclass_classification_pipeline.py index 171c930ee5..4b319d5019 100644 --- a/evalml/pipelines/multiclass_classification_pipeline.py +++ b/evalml/pipelines/multiclass_classification_pipeline.py @@ -4,4 +4,5 @@ class MulticlassClassificationPipeline(ClassificationPipeline): """Pipeline subclass for all multiclass classification pipelines.""" + problem_type = ProblemTypes.MULTICLASS diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index af14cd80b6..29f84eb419 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -15,7 +15,7 @@ Estimator, LinearDiscriminantAnalysis, StackedEnsembleClassifier, - StackedEnsembleRegressor + StackedEnsembleRegressor, ) from .components.utils import all_components, handle_component_class @@ -31,7 +31,7 @@ jupyter_check, log_subtitle, log_title, - safe_repr + safe_repr, ) logger = get_logger(__file__) @@ -42,11 +42,9 @@ class PipelineBase(ABC, metaclass=PipelineBaseMeta): problem_type = None - def __init__(self, - component_graph, - parameters=None, - custom_name=None, - random_seed=0): + def __init__( + self, component_graph, parameters=None, custom_name=None, random_seed=0 + ): """Machine learning pipeline made out of transformers and a estimator. Arguments: @@ -64,9 +62,13 @@ def __init__(self, self.component_graph = component_graph if isinstance(component_graph, list): # Backwards compatibility - self._component_graph = ComponentGraph().from_list(component_graph, random_seed=self.random_seed) + self._component_graph = ComponentGraph().from_list( + component_graph, random_seed=self.random_seed + ) else: - self._component_graph = ComponentGraph(component_dict=component_graph, random_seed=self.random_seed) + self._component_graph = ComponentGraph( + component_dict=component_graph, random_seed=self.random_seed + ) self._component_graph.instantiate(parameters) self.input_feature_names = {} @@ -75,8 +77,14 @@ def __init__(self, self.estimator = None if len(self._component_graph.compute_order) > 0: final_component = self._component_graph.get_last_component() - self.estimator = final_component if isinstance(final_component, Estimator) else None - self._estimator_name = self._component_graph.compute_order[-1] if self.estimator is not None else None + self.estimator = ( + final_component if isinstance(final_component, Estimator) else None + ) + self._estimator_name = ( + self._component_graph.compute_order[-1] + if self.estimator is not None + else None + ) self._validate_estimator_problem_type() self._is_fitted = False @@ -102,19 +110,24 @@ def summary(self): """A short summary of the pipeline structure, describing the list of components used. 
Example: Logistic Regression Classifier w/ Simple Imputer + One Hot Encoder """ - component_graph = [handle_component_class(component_class) for _, component_class in copy.copy(self.linearized_component_graph)] + component_graph = [ + handle_component_class(component_class) + for _, component_class in copy.copy(self.linearized_component_graph) + ] if len(component_graph) == 0: return "Empty Pipeline" summary = "Pipeline" component_graph[-1] = component_graph[-1] - if inspect.isclass(component_graph[-1]) and issubclass(component_graph[-1], Estimator): + if inspect.isclass(component_graph[-1]) and issubclass( + component_graph[-1], Estimator + ): estimator_class = component_graph.pop(-1) summary = estimator_class.name if len(component_graph) == 0: return summary component_names = [component_class.name for component_class in component_graph] - return '{} w/ {}'.format(summary, ' + '.join(component_names)) + return "{} w/ {}".format(summary, " + ".join(component_names)) @property def linearized_component_graph(self): @@ -123,20 +136,25 @@ def linearized_component_graph(self): def _validate_estimator_problem_type(self): """Validates this pipeline's problem_type against that of the estimator from `self.component_graph`""" - if self.estimator is None: # Allow for pipelines that do not end with an estimator + if ( + self.estimator is None + ): # Allow for pipelines that do not end with an estimator return estimator_problem_types = self.estimator.supported_problem_types if self.problem_type not in estimator_problem_types: - raise ValueError("Problem type {} not valid for this component graph. Valid problem types include {}." - .format(self.problem_type, estimator_problem_types)) + raise ValueError( + "Problem type {} not valid for this component graph. Valid problem types include {}.".format( + self.problem_type, estimator_problem_types + ) + ) def __getitem__(self, index): if isinstance(index, slice): - raise NotImplementedError('Slicing pipelines is currently not supported.') + raise NotImplementedError("Slicing pipelines is currently not supported.") return self._component_graph[index] def __setitem__(self, index, value): - raise NotImplementedError('Setting pipeline components is not supported.') + raise NotImplementedError("Setting pipeline components is not supported.") def get_component(self, name): """Returns component by name @@ -164,7 +182,11 @@ def describe(self, return_dict=False): logger.info("Model Family: {}".format(str(self.model_family))) if self._estimator_name in self.input_feature_names: - logger.info("Number of features: {}".format(len(self.input_feature_names[self._estimator_name]))) + logger.info( + "Number of features: {}".format( + len(self.input_feature_names[self._estimator_name]) + ) + ) # Summary of steps log_subtitle(logger, "Pipeline Steps") @@ -173,13 +195,19 @@ def describe(self, return_dict=False): "name": self.name, "problem_type": self.problem_type, "model_family": self.model_family, - "components": dict() + "components": dict(), } for number, component in enumerate(self._component_graph, 1): component_string = str(number) + ". 
" + component.name logger.info(component_string) - pipeline_dict["components"].update({component.name: component.describe(print_name=False, return_dict=return_dict)}) + pipeline_dict["components"].update( + { + component.name: component.describe( + print_name=False, return_dict=return_dict + ) + } + ) if return_dict: return pipeline_dict @@ -271,9 +299,18 @@ def _score_all_objectives(self, X, y, y_pred, y_pred_proba, objectives): for objective in objectives: try: if not objective.is_defined_for_problem_type(self.problem_type): - raise ValueError(f'Invalid objective {objective.name} specified for problem type {self.problem_type}') - y_pred = self._select_y_pred_for_score(X, y, y_pred, y_pred_proba, objective) - score = self._score(X, y, y_pred_proba if objective.score_needs_proba else y_pred, objective) + raise ValueError( + f"Invalid objective {objective.name} specified for problem type {self.problem_type}" + ) + y_pred = self._select_y_pred_for_score( + X, y, y_pred, y_pred_proba, objective + ) + score = self._score( + X, + y, + y_pred_proba if objective.score_needs_proba else y_pred, + objective, + ) scored_successfully.update({objective.name: score}) except Exception as e: tb = traceback.format_tb(sys.exc_info()[2]) @@ -296,7 +333,9 @@ def model_family(self): else: order = ComponentGraph.generate_order(component_graph.component_dict) final_component = order[-1] - return handle_component_class(component_graph[final_component].__class__).model_family + return handle_component_class( + component_graph[final_component].__class__ + ).model_family @property def parameters(self): @@ -305,10 +344,15 @@ def parameters(self): Returns: dict: Dictionary of all component parameters """ - components = [(component_name, component_class) for component_name, component_class in self._component_graph.component_instances.items()] - component_parameters = {c_name: copy.copy(c.parameters) for c_name, c in components if c.parameters} + components = [ + (component_name, component_class) + for component_name, component_class in self._component_graph.component_instances.items() + ] + component_parameters = { + c_name: copy.copy(c.parameters) for c_name, c in components if c.parameters + } if self._pipeline_params: - component_parameters['pipeline'] = self._pipeline_params + component_parameters["pipeline"] = self._pipeline_params return component_parameters @property @@ -333,7 +377,9 @@ def feature_importance(self): pd.DataFrame including feature names and their corresponding importance """ feature_names = self.input_feature_names[self._estimator_name] - importance = list(zip(feature_names, self.estimator.feature_importance)) # note: this only works for binary + importance = list( + zip(feature_names, self.estimator.feature_importance) + ) # note: this only works for binary importance.sort(key=lambda x: -abs(x[1])) df = pd.DataFrame(importance, columns=["feature", "importance"]) return df @@ -347,18 +393,20 @@ def graph(self, filepath=None): Returns: graphviz.Digraph: Graph object that can be directly displayed in Jupyter notebooks. """ - graphviz = import_or_raise('graphviz', error_msg='Please install graphviz to visualize pipelines.') + graphviz = import_or_raise( + "graphviz", error_msg="Please install graphviz to visualize pipelines." 
+ ) # Try rendering a dummy graph to see if a working backend is installed try: graphviz.Digraph().pipe() except graphviz.backend.ExecutableNotFound: raise RuntimeError( - "To graph pipelines, a graphviz backend is required.\n" + - "Install the backend using one of the following commands:\n" + - " Mac OS: brew install graphviz\n" + - " Linux (Ubuntu): sudo apt-get install graphviz\n" + - " Windows: conda install python-graphviz\n" + "To graph pipelines, a graphviz backend is required.\n" + + "Install the backend using one of the following commands:\n" + + " Mac OS: brew install graphviz\n" + + " Linux (Ubuntu): sudo apt-get install graphviz\n" + + " Windows: conda install python-graphviz\n" ) graph_format = None @@ -367,16 +415,22 @@ def graph(self, filepath=None): # Explicitly cast to str in case a Path object was passed in filepath = str(filepath) try: - f = open(filepath, 'w') + f = open(filepath, "w") f.close() except (IOError, FileNotFoundError): - raise ValueError(('Specified filepath is not writeable: {}'.format(filepath))) + raise ValueError( + ("Specified filepath is not writeable: {}".format(filepath)) + ) path_and_name, graph_format = os.path.splitext(filepath) graph_format = graph_format[1:].lower() # ignore the dot supported_filetypes = graphviz.backend.FORMATS if graph_format not in supported_filetypes: - raise ValueError(("Unknown format '{}'. Make sure your format is one of the " + - "following: {}").format(graph_format, supported_filetypes)) + raise ValueError( + ( + "Unknown format '{}'. Make sure your format is one of the " + + "following: {}" + ).format(graph_format, supported_filetypes) + ) graph = self._component_graph.graph(path_and_name, graph_format) @@ -394,38 +448,39 @@ def graph_feature_importance(self, importance_threshold=0): Returns: plotly.Figure, a bar graph showing features and their corresponding importance """ - go = import_or_raise("plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects") + go = import_or_raise( + "plotly.graph_objects", + error_msg="Cannot find dependency plotly.graph_objects", + ) if jupyter_check(): import_or_raise("ipywidgets", warning=True) feat_imp = self.feature_importance - feat_imp['importance'] = abs(feat_imp['importance']) + feat_imp["importance"] = abs(feat_imp["importance"]) if importance_threshold < 0: - raise ValueError(f'Provided importance threshold of {importance_threshold} must be greater than or equal to 0') + raise ValueError( + f"Provided importance threshold of {importance_threshold} must be greater than or equal to 0" + ) # Remove features with importance whose absolute value is less than importance threshold - feat_imp = feat_imp[feat_imp['importance'] >= importance_threshold] + feat_imp = feat_imp[feat_imp["importance"] >= importance_threshold] # List is reversed to go from ascending order to descending order feat_imp = feat_imp.iloc[::-1] - title = 'Feature Importance' - subtitle = 'May display fewer features due to feature selection' - data = [go.Bar( - x=feat_imp['importance'], - y=feat_imp['feature'], - orientation='h' - )] + title = "Feature Importance" + subtitle = "May display fewer features due to feature selection" + data = [ + go.Bar(x=feat_imp["importance"], y=feat_imp["feature"], orientation="h") + ] layout = { - 'title': '{0}
<br>{1}'.format(title, subtitle),
-            'height': 800,
-            'xaxis_title': 'Feature Importance',
-            'yaxis_title': 'Feature',
-            'yaxis': {
-                'type': 'category'
-            }
+            "title": "{0}<br>
{1}".format(title, subtitle), + "height": 800, + "xaxis_title": "Feature Importance", + "yaxis_title": "Feature", + "yaxis": {"type": "category"}, } fig = go.Figure(data=data, layout=layout) @@ -441,7 +496,7 @@ def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): Returns: None """ - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: cloudpickle.dump(self, f, protocol=pickle_protocol) @staticmethod @@ -454,7 +509,7 @@ def load(file_path): Returns: PipelineBase object """ - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: return cloudpickle.load(f) def clone(self): @@ -463,7 +518,12 @@ def clone(self): Returns: A new instance of this pipeline with identical components, parameters, and random state. """ - return self.__class__(self.component_graph, parameters=self.parameters, custom_name=self.custom_name, random_seed=self.random_seed) + return self.__class__( + self.component_graph, + parameters=self.parameters, + custom_name=self.custom_name, + random_seed=self.random_seed, + ) def new(self, parameters, random_seed=0): """Constructs a new instance of the pipeline with the same component graph but with a different set of parameters. @@ -476,7 +536,12 @@ def new(self, parameters, random_seed=0): Returns: A new instance of this pipeline with identical components. """ - return self.__class__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + return self.__class__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) def __eq__(self, other): if not isinstance(other, self.__class__): @@ -484,7 +549,13 @@ def __eq__(self, other): random_seed_eq = self.random_seed == other.random_seed if not random_seed_eq: return False - attributes_to_check = ['parameters', '_is_fitted', 'component_graph', 'input_feature_names', 'input_target_name'] + attributes_to_check = [ + "parameters", + "_is_fitted", + "component_graph", + "input_feature_names", + "input_target_name", + ] for attribute in attributes_to_check: if getattr(self, attribute) != getattr(other, attribute): return False @@ -495,19 +566,39 @@ def __str__(self): def __repr__(self): def repr_component(parameters): - return ', '.join([f"'{key}': {safe_repr(value)}" for key, value in parameters.items()]) + return ", ".join( + [f"'{key}': {safe_repr(value)}" for key, value in parameters.items()] + ) - component_graph_repr = ", ".join([f"'{component}'" if isinstance(component, str) else component.__name__ for component in self.component_graph]) + component_graph_repr = ", ".join( + [ + f"'{component}'" if isinstance(component, str) else component.__name__ + for component in self.component_graph + ] + ) component_graph_str = f"[{component_graph_repr}]" - parameters_repr = ', '.join([f"'{component}':{{{repr_component(parameters)}}}" for component, parameters in self.parameters.items()]) + parameters_repr = ", ".join( + [ + f"'{component}':{{{repr_component(parameters)}}}" + for component, parameters in self.parameters.items() + ] + ) parameters_str = f"parameters={{{parameters_repr}}}" - custom_name_repr = f"custom_name='{self.custom_name}'" if self.custom_name else None + custom_name_repr = ( + f"custom_name='{self.custom_name}'" if self.custom_name else None + ) random_seed_str = f"random_seed={self.random_seed}" - additional_args_str = ", ".join([arg for arg in [parameters_str, custom_name_repr, random_seed_str] if arg is not None]) + additional_args_str = ", ".join( + [ + arg + for arg in 
[parameters_str, custom_name_repr, random_seed_str] + if arg is not None + ] + ) - return f'pipeline = {(type(self).__name__)}(component_graph={component_graph_str}, {additional_args_str})' + return f"pipeline = {(type(self).__name__)}(component_graph={component_graph_str}, {additional_args_str})" def __iter__(self): return self @@ -520,20 +611,40 @@ def _get_feature_provenance(self): @property def _supports_fast_permutation_importance(self): - has_more_than_one_estimator = sum(isinstance(c, Estimator) for c in self._component_graph) > 1 + has_more_than_one_estimator = ( + sum(isinstance(c, Estimator) for c in self._component_graph) > 1 + ) _all_components = set(all_components()) - has_custom_components = any(c.__class__ not in _all_components for c in self._component_graph) - has_dim_reduction = any(isinstance(c, (PCA, LinearDiscriminantAnalysis)) for c in self._component_graph) + has_custom_components = any( + c.__class__ not in _all_components for c in self._component_graph + ) + has_dim_reduction = any( + isinstance(c, (PCA, LinearDiscriminantAnalysis)) + for c in self._component_graph + ) has_dfs = any(isinstance(c, DFSTransformer) for c in self._component_graph) - has_stacked_ensembler = any(isinstance(c, (StackedEnsembleClassifier, StackedEnsembleRegressor)) for c in self._component_graph) - return not any([has_more_than_one_estimator, has_custom_components, has_dim_reduction, has_dfs, has_stacked_ensembler]) + has_stacked_ensembler = any( + isinstance(c, (StackedEnsembleClassifier, StackedEnsembleRegressor)) + for c in self._component_graph + ) + return not any( + [ + has_more_than_one_estimator, + has_custom_components, + has_dim_reduction, + has_dfs, + has_stacked_ensembler, + ] + ) @staticmethod def create_objectives(objectives): objective_instances = [] for objective in objectives: try: - objective_instances.append(get_objective(objective, return_instance=True)) + objective_instances.append( + get_objective(objective, return_instance=True) + ) except ObjectiveCreationError as e: msg = f"Cannot pass {objective} as a string in pipeline.score. Instantiate first and then add it to the list of objectives." raise ObjectiveCreationError(msg) from e @@ -542,12 +653,16 @@ def create_objectives(objectives): def can_tune_threshold_with_objective(self, objective): """Determine whether the threshold of a binary classification pipeline can be tuned. - Arguments: - pipeline (PipelineBase): Binary classification pipeline. - objective (ObjectiveBase): Primary AutoMLSearch objective. + Arguments: + pipeline (PipelineBase): Binary classification pipeline. + objective (ObjectiveBase): Primary AutoMLSearch objective. - Returns: - bool: True if the pipeline threshold can be tuned. + Returns: + bool: True if the pipeline threshold can be tuned. """ - return is_binary(self.problem_type) and objective.is_defined_for_problem_type(self.problem_type) and objective.can_optimize_threshold + return ( + is_binary(self.problem_type) + and objective.is_defined_for_problem_type(self.problem_type) + and objective.can_optimize_threshold + ) diff --git a/evalml/pipelines/pipeline_meta.py b/evalml/pipelines/pipeline_meta.py index 36e81d1b45..3d5b4c7500 100644 --- a/evalml/pipelines/pipeline_meta.py +++ b/evalml/pipelines/pipeline_meta.py @@ -1,5 +1,3 @@ - - from functools import wraps from evalml.exceptions import PipelineNotYetFittedError @@ -12,19 +10,23 @@ class PipelineBaseMeta(BaseMeta): @classmethod def check_for_fit(cls, method): """`check_for_fit` wraps a method that validates if `self._is_fitted` is `True`. 
- It raises an exception if `False` and calls and returns the wrapped method if `True`. + It raises an exception if `False` and calls and returns the wrapped method if `True`. """ + @wraps(method) def _check_for_fit(self, X=None, objective=None): klass = type(self).__name__ if not self._is_fitted: - raise PipelineNotYetFittedError(f'This {klass} is not fitted yet. You must fit {klass} before calling {method.__name__}.') - if method.__name__ == 'predict_proba': + raise PipelineNotYetFittedError( + f"This {klass} is not fitted yet. You must fit {klass} before calling {method.__name__}." + ) + if method.__name__ == "predict_proba": return method(self, X) - elif method.__name__ == 'predict': + elif method.__name__ == "predict": return method(self, X, objective) else: return method(self) + return _check_for_fit @@ -34,15 +36,19 @@ class TimeSeriesPipelineBaseMeta(PipelineBaseMeta): @classmethod def check_for_fit(cls, method): """`check_for_fit` wraps a method that validates if `self._is_fitted` is `True`. - It raises an exception if `False` and calls and returns the wrapped method if `True`. + It raises an exception if `False` and calls and returns the wrapped method if `True`. """ + @wraps(method) def _check_for_fit(self, X=None, y=None, objective=None): klass = type(self).__name__ if not self._is_fitted: - raise PipelineNotYetFittedError(f'This {klass} is not fitted yet. You must fit {klass} before calling {method.__name__}.') - if method.__name__ == 'predict_proba': + raise PipelineNotYetFittedError( + f"This {klass} is not fitted yet. You must fit {klass} before calling {method.__name__}." + ) + if method.__name__ == "predict_proba": return method(self, X, y) - elif method.__name__ == 'predict': + elif method.__name__ == "predict": return method(self, X, y, objective) + return _check_for_fit diff --git a/evalml/pipelines/regression_pipeline.py b/evalml/pipelines/regression_pipeline.py index 12f6036d51..496a0951f0 100644 --- a/evalml/pipelines/regression_pipeline.py +++ b/evalml/pipelines/regression_pipeline.py @@ -1,4 +1,3 @@ - from evalml.pipelines import PipelineBase from evalml.problem_types import ProblemTypes from evalml.utils import infer_feature_types @@ -6,6 +5,7 @@ class RegressionPipeline(PipelineBase): """Pipeline subclass for all regression pipelines.""" + problem_type = ProblemTypes.REGRESSION def fit(self, X, y): @@ -39,4 +39,6 @@ def score(self, X, y, objectives): """ objectives = self.create_objectives(objectives) y_predicted = self.predict(X) - return self._score_all_objectives(X, y, y_predicted, y_pred_proba=None, objectives=objectives) + return self._score_all_objectives( + X, y, y_predicted, y_pred_proba=None, objectives=objectives + ) diff --git a/evalml/pipelines/time_series_classification_pipelines.py b/evalml/pipelines/time_series_classification_pipelines.py index 4111fd7b79..e4a4e07d46 100644 --- a/evalml/pipelines/time_series_classification_pipelines.py +++ b/evalml/pipelines/time_series_classification_pipelines.py @@ -1,25 +1,22 @@ - import pandas as pd -from .binary_classification_pipeline_mixin import ( - BinaryClassificationPipelineMixin -) +from .binary_classification_pipeline_mixin import BinaryClassificationPipelineMixin from evalml.objectives import get_objective from evalml.pipelines.classification_pipeline import ClassificationPipeline from evalml.pipelines.pipeline_meta import TimeSeriesPipelineBaseMeta from evalml.problem_types import ProblemTypes -from evalml.utils import ( - drop_rows_with_nans, - infer_feature_types, - pad_with_nans -) +from 
evalml.utils import drop_rows_with_nans, infer_feature_types, pad_with_nans -class TimeSeriesClassificationPipeline(ClassificationPipeline, metaclass=TimeSeriesPipelineBaseMeta): +class TimeSeriesClassificationPipeline( + ClassificationPipeline, metaclass=TimeSeriesPipelineBaseMeta +): """Pipeline base class for time series classification problems.""" - def __init__(self, component_graph, parameters=None, custom_name=None, random_seed=0): + def __init__( + self, component_graph, parameters=None, custom_name=None, random_seed=0 + ): """Machine learning pipeline for time series classification problems made out of transformers and a classifier. Arguments: @@ -35,16 +32,20 @@ def __init__(self, component_graph, parameters=None, custom_name=None, random_se random_seed (int): Seed for the random number generator. Defaults to 0. """ if "pipeline" not in parameters: - raise ValueError("date_index, gap, and max_delay parameters cannot be omitted from the parameters dict. " - "Please specify them as a dictionary with the key 'pipeline'.") + raise ValueError( + "date_index, gap, and max_delay parameters cannot be omitted from the parameters dict. " + "Please specify them as a dictionary with the key 'pipeline'." + ) pipeline_params = parameters["pipeline"] - self.date_index = pipeline_params['date_index'] - self.gap = pipeline_params['gap'] - self.max_delay = pipeline_params['max_delay'] - super().__init__(component_graph, - custom_name=custom_name, - parameters=parameters, - random_seed=random_seed) + self.date_index = pipeline_params["date_index"] + self.gap = pipeline_params["gap"] + self.max_delay = pipeline_params["max_delay"] + super().__init__( + component_graph, + custom_name=custom_name, + parameters=parameters, + random_seed=random_seed, + ) @staticmethod def _convert_to_woodwork(X, y): @@ -99,7 +100,9 @@ def _predict(self, X, y, objective=None, pad=False): features_no_nan, y_no_nan = drop_rows_with_nans(features, y) predictions = self._estimator_predict(features_no_nan, y_no_nan) if pad: - padded = pad_with_nans(predictions, max(0, features.shape[0] - predictions.shape[0])) + padded = pad_with_nans( + predictions, max(0, features.shape[0] - predictions.shape[0]) + ) return infer_feature_types(padded) return predictions @@ -120,7 +123,9 @@ def predict(self, X, y=None, objective=None): predictions = self._predict(X, y, objective=objective, pad=False) # In case gap is 0 and this is a baseline pipeline, we drop the nans in the # predictions before decoding them - predictions = pd.Series(self._decode_targets(predictions.dropna()), name=self.input_target_name) + predictions = pd.Series( + self._decode_targets(predictions.dropna()), name=self.input_target_name + ) padded = pad_with_nans(predictions, max(0, n_features - predictions.shape[0])) return infer_feature_types(padded) @@ -158,14 +163,26 @@ def score(self, X, y, objectives): y_encoded = self._encode_targets(y) y_shifted = y_encoded.shift(-self.gap) - y_predicted, y_predicted_proba = self._compute_predictions(X, y, objectives, time_series=True) - y_shifted, y_predicted, y_predicted_proba = drop_rows_with_nans(y_shifted, y_predicted, y_predicted_proba) - return self._score_all_objectives(X, y_shifted, y_predicted, - y_pred_proba=y_predicted_proba, - objectives=objectives) - - -class TimeSeriesBinaryClassificationPipeline(BinaryClassificationPipelineMixin, TimeSeriesClassificationPipeline, metaclass=TimeSeriesPipelineBaseMeta): + y_predicted, y_predicted_proba = self._compute_predictions( + X, y, objectives, time_series=True + ) + y_shifted, 
y_predicted, y_predicted_proba = drop_rows_with_nans( + y_shifted, y_predicted, y_predicted_proba + ) + return self._score_all_objectives( + X, + y_shifted, + y_predicted, + y_pred_proba=y_predicted_proba, + objectives=objectives, + ) + + +class TimeSeriesBinaryClassificationPipeline( + BinaryClassificationPipelineMixin, + TimeSeriesClassificationPipeline, + metaclass=TimeSeriesPipelineBaseMeta, +): problem_type = ProblemTypes.TIME_SERIES_BINARY def _predict(self, X, y, objective=None, pad=False): @@ -175,7 +192,9 @@ def _predict(self, X, y, objective=None, pad=False): if objective is not None: objective = get_objective(objective, return_instance=True) if not objective.is_defined_for_problem_type(self.problem_type): - raise ValueError(f"Objective {objective.name} is not defined for time series binary classification.") + raise ValueError( + f"Objective {objective.name} is not defined for time series binary classification." + ) if self.threshold is None: predictions = self._estimator_predict(features_no_nan, y_no_nan) @@ -185,15 +204,18 @@ def _predict(self, X, y, objective=None, pad=False): if objective is None: predictions = proba > self.threshold else: - predictions = objective.decision_function(proba, threshold=self.threshold, X=features_no_nan) + predictions = objective.decision_function( + proba, threshold=self.threshold, X=features_no_nan + ) if pad: - predictions = pad_with_nans(predictions, max(0, features.shape[0] - predictions.shape[0])) + predictions = pad_with_nans( + predictions, max(0, features.shape[0] - predictions.shape[0]) + ) return infer_feature_types(predictions) @staticmethod def _score(X, y, predictions, objective): - """Given data, model predictions or predicted probabilities computed on the data, and an objective, evaluate and return the objective score. - """ + """Given data, model predictions or predicted probabilities computed on the data, and an objective, evaluate and return the objective score.""" if predictions.ndim > 1: predictions = predictions.iloc[:, 1] return TimeSeriesClassificationPipeline._score(X, y, predictions, objective) diff --git a/evalml/pipelines/time_series_regression_pipeline.py b/evalml/pipelines/time_series_regression_pipeline.py index ef34c7132a..834ff2233e 100644 --- a/evalml/pipelines/time_series_regression_pipeline.py +++ b/evalml/pipelines/time_series_regression_pipeline.py @@ -3,19 +3,19 @@ from evalml.pipelines.pipeline_meta import TimeSeriesPipelineBaseMeta from evalml.pipelines.regression_pipeline import RegressionPipeline from evalml.problem_types import ProblemTypes -from evalml.utils import ( - drop_rows_with_nans, - infer_feature_types, - pad_with_nans -) +from evalml.utils import drop_rows_with_nans, infer_feature_types, pad_with_nans -class TimeSeriesRegressionPipeline(RegressionPipeline, metaclass=TimeSeriesPipelineBaseMeta): +class TimeSeriesRegressionPipeline( + RegressionPipeline, metaclass=TimeSeriesPipelineBaseMeta +): """Pipeline base class for time series regression problems.""" problem_type = ProblemTypes.TIME_SERIES_REGRESSION - def __init__(self, component_graph, parameters=None, custom_name=None, random_seed=0): + def __init__( + self, component_graph, parameters=None, custom_name=None, random_seed=0 + ): """Machine learning pipeline for time series regression problems made out of transformers and a classifier. Arguments: @@ -31,16 +31,20 @@ def __init__(self, component_graph, parameters=None, custom_name=None, random_se random_seed (int): Seed for the random number generator. Defaults to 0. 
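As a rough sketch of what the required parameters dictionary looks like in practice (the component graph and the gap/max_delay values below are illustrative assumptions):

from evalml.pipelines import TimeSeriesRegressionPipeline

# The "pipeline" key is mandatory; omitting it raises the ValueError shown just below.
pipeline = TimeSeriesRegressionPipeline(
    component_graph=["Imputer", "Random Forest Regressor"],
    parameters={"pipeline": {"date_index": None, "gap": 1, "max_delay": 2}},
)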
""" if "pipeline" not in parameters: - raise ValueError("date_index, gap, and max_delay parameters cannot be omitted from the parameters dict. " - "Please specify them as a dictionary with the key 'pipeline'.") + raise ValueError( + "date_index, gap, and max_delay parameters cannot be omitted from the parameters dict. " + "Please specify them as a dictionary with the key 'pipeline'." + ) pipeline_params = parameters["pipeline"] - self.date_index = pipeline_params['date_index'] - self.gap = pipeline_params['gap'] - self.max_delay = pipeline_params['max_delay'] - super().__init__(component_graph, - custom_name=custom_name, - parameters=parameters, - random_seed=random_seed) + self.date_index = pipeline_params["date_index"] + self.gap = pipeline_params["gap"] + self.max_delay = pipeline_params["max_delay"] + super().__init__( + component_graph, + custom_name=custom_name, + parameters=parameters, + random_seed=random_seed, + ) def fit(self, X, y): """Fit a time series regression pipeline. @@ -88,7 +92,9 @@ def predict(self, X, y=None, objective=None): y_arg = y predictions = self.estimator.predict(features_no_nan, y_arg) predictions = predictions.rename(self.input_target_name) - padded = pad_with_nans(predictions, max(0, features.shape[0] - predictions.shape[0])) + padded = pad_with_nans( + predictions, max(0, features.shape[0] - predictions.shape[0]) + ) return infer_feature_types(padded) def score(self, X, y, objectives): @@ -113,7 +119,6 @@ def score(self, X, y, objectives): y_shifted = y.shift(-self.gap) objectives = self.create_objectives(objectives) y_shifted, y_predicted = drop_rows_with_nans(y_shifted, y_predicted) - return self._score_all_objectives(X, y_shifted, - y_predicted, - y_pred_proba=None, - objectives=objectives) + return self._score_all_objectives( + X, y_shifted, y_predicted, y_pred_proba=None, objectives=objectives + ) diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index 49172b0c78..50fd437189 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -1,13 +1,11 @@ from woodwork import logical_types from .binary_classification_pipeline import BinaryClassificationPipeline -from .multiclass_classification_pipeline import ( - MulticlassClassificationPipeline -) +from .multiclass_classification_pipeline import MulticlassClassificationPipeline from .regression_pipeline import RegressionPipeline from .time_series_classification_pipelines import ( TimeSeriesBinaryClassificationPipeline, - TimeSeriesMulticlassClassificationPipeline + TimeSeriesMulticlassClassificationPipeline, ) from .time_series_regression_pipeline import TimeSeriesRegressionPipeline @@ -34,21 +32,23 @@ StandardScaler, TargetImputer, TextFeaturizer, - Undersampler + Undersampler, ) from evalml.pipelines.components.utils import get_estimators from evalml.problem_types import ( ProblemTypes, handle_problem_types, is_classification, - is_time_series + is_time_series, ) from evalml.utils import get_logger, import_or_raise, infer_feature_types logger = get_logger(__file__) -def _get_preprocessing_components(X, y, problem_type, estimator_class, sampler_name=None): +def _get_preprocessing_components( + X, y, problem_type, estimator_class, sampler_name=None +): """Given input data, target data and an estimator class, construct a recommended preprocessing chain to be combined with the estimator and trained on the provided data. 
Arguments: @@ -67,16 +67,20 @@ def _get_preprocessing_components(X, y, problem_type, estimator_class, sampler_n if len(all_null_cols) > 0: pp_components.append(DropNullColumns) input_logical_types = set(X.ww.logical_types.values()) - types_imputer_handles = {logical_types.Boolean, logical_types.Categorical, - logical_types.Double, logical_types.Integer} + types_imputer_handles = { + logical_types.Boolean, + logical_types.Categorical, + logical_types.Double, + logical_types.Integer, + } if len(input_logical_types.intersection(types_imputer_handles)) > 0: pp_components.append(Imputer) - text_columns = list(X.ww.select('NaturalLanguage').columns) + text_columns = list(X.ww.select("NaturalLanguage").columns) if len(text_columns) > 0: pp_components.append(TextFeaturizer) - index_columns = list(X.ww.select('index').columns) + index_columns = list(X.ww.select("index").columns) if len(index_columns) > 0: pp_components.append(DropColumns) @@ -86,25 +90,35 @@ def _get_preprocessing_components(X, y, problem_type, estimator_class, sampler_n if add_datetime_featurizer and estimator_class.model_family != ModelFamily.ARIMA: pp_components.append(DateTimeFeaturizer) - if is_time_series(problem_type) and estimator_class.model_family != ModelFamily.ARIMA: + if ( + is_time_series(problem_type) + and estimator_class.model_family != ModelFamily.ARIMA + ): pp_components.append(DelayedFeatureTransformer) - categorical_cols = X.ww.select('category') - if len(categorical_cols.columns) > 0 and estimator_class not in {CatBoostClassifier, CatBoostRegressor}: + categorical_cols = X.ww.select("category") + if len(categorical_cols.columns) > 0 and estimator_class not in { + CatBoostClassifier, + CatBoostRegressor, + }: pp_components.append(OneHotEncoder) sampler_components = { "Undersampler": Undersampler, "SMOTE Oversampler": SMOTESampler, "SMOTENC Oversampler": SMOTENCSampler, - "SMOTEN Oversampler": SMOTENSampler + "SMOTEN Oversampler": SMOTENSampler, } if sampler_name is not None: try: - import_or_raise("imblearn.over_sampling", error_msg="imbalanced-learn is not installed") + import_or_raise( + "imblearn.over_sampling", error_msg="imbalanced-learn is not installed" + ) pp_components.append(sampler_components[sampler_name]) except ImportError: - logger.debug(f'Could not import imblearn.over_sampling, so defaulting to use Undersampler') + logger.debug( + f"Could not import imblearn.over_sampling, so defaulting to use Undersampler" + ) pp_components.append(Undersampler) if estimator_class.model_family == ModelFamily.LINEAR_MODEL: @@ -131,21 +145,21 @@ def _get_pipeline_base_class(problem_type): def make_pipeline(X, y, estimator, problem_type, parameters=None, sampler_name=None): """Given input data, target data, an estimator class and the problem type, - generates a pipeline class with a preprocessing chain which was recommended based on the inputs. - The pipeline will be a subclass of the appropriate pipeline base class for the specified problem_type. - - Arguments: - X (pd.DataFrame): The input data of shape [n_samples, n_features] - y (pd.Series): The target data of length [n_samples] - estimator (Estimator): Estimator for pipeline - problem_type (ProblemTypes or str): Problem type for pipeline to generate - parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values. - An empty dictionary or None implies using all default values for component parameters. - sampler_name (str): The name of the sampler component to add to the pipeline. 
Only used in classification problems. - Defaults to None + generates a pipeline class with a preprocessing chain which was recommended based on the inputs. + The pipeline will be a subclass of the appropriate pipeline base class for the specified problem_type. - Returns: - PipelineBase object: PipelineBase instance with dynamically generated preprocessing components and specified estimator + Arguments: + X (pd.DataFrame): The input data of shape [n_samples, n_features] + y (pd.Series): The target data of length [n_samples] + estimator (Estimator): Estimator for pipeline + problem_type (ProblemTypes or str): Problem type for pipeline to generate + parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values. + An empty dictionary or None implies using all default values for component parameters. + sampler_name (str): The name of the sampler component to add to the pipeline. Only used in classification problems. + Defaults to None + + Returns: + PipelineBase object: PipelineBase instance with dynamically generated preprocessing components and specified estimator """ X = infer_feature_types(X) @@ -155,8 +169,12 @@ def make_pipeline(X, y, estimator, problem_type, parameters=None, sampler_name=N if estimator not in get_estimators(problem_type): raise ValueError(f"{estimator.name} is not a valid estimator for problem type") if not is_classification(problem_type) and sampler_name is not None: - raise ValueError(f"Sampling is unsupported for problem_type {str(problem_type)}") - preprocessing_components = _get_preprocessing_components(X, y, problem_type, estimator, sampler_name) + raise ValueError( + f"Sampling is unsupported for problem_type {str(problem_type)}" + ) + preprocessing_components = _get_preprocessing_components( + X, y, problem_type, estimator, sampler_name + ) complete_component_graph = preprocessing_components + [estimator] base_class = _get_pipeline_base_class(problem_type) @@ -176,15 +194,23 @@ def generate_pipeline_code(element): # hold the imports needed and add code to end code_strings = [] if not isinstance(element, PipelineBase): - raise ValueError("Element must be a pipeline instance, received {}".format(type(element))) + raise ValueError( + "Element must be a pipeline instance, received {}".format(type(element)) + ) if isinstance(element.component_graph, dict): raise ValueError("Code generation for nonlinear pipelines is not supported yet") - code_strings.append("from {} import {}".format(element.__class__.__module__, element.__class__.__name__)) + code_strings.append( + "from {} import {}".format( + element.__class__.__module__, element.__class__.__name__ + ) + ) code_strings.append(repr(element)) return "\n".join(code_strings) -def _make_stacked_ensemble_pipeline(input_pipelines, problem_type, n_jobs=-1, random_seed=0): +def _make_stacked_ensemble_pipeline( + input_pipelines, problem_type, n_jobs=-1, random_seed=0 +): """ Creates a pipeline with a stacked ensemble estimator. 
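For reference, a minimal sketch of how the make_pipeline helper documented above might be invoked with the sampler_name argument; the toy data and the chosen estimator are assumptions for illustration only.

import pandas as pd
from evalml.pipelines.components import RandomForestClassifier
from evalml.pipelines.utils import make_pipeline

X = pd.DataFrame({"feature_a": range(100), "feature_b": range(100)})
y = pd.Series([0] * 75 + [1] * 25)

# Returns a BinaryClassificationPipeline instance whose preprocessing chain
# (imputer, sampler, etc.) was selected from the data, ending in the estimator.
pipeline = make_pipeline(
    X, y, RandomForestClassifier, "binary", sampler_name="Undersampler"
)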
@@ -201,20 +227,43 @@ def _make_stacked_ensemble_pipeline(input_pipelines, problem_type, n_jobs=-1, ra """ parameters = {} if is_classification(problem_type): - parameters = {"Stacked Ensemble Classifier": {"input_pipelines": input_pipelines, "n_jobs": n_jobs}} + parameters = { + "Stacked Ensemble Classifier": { + "input_pipelines": input_pipelines, + "n_jobs": n_jobs, + } + } estimator = StackedEnsembleClassifier else: - parameters = {"Stacked Ensemble Regressor": {"input_pipelines": input_pipelines, "n_jobs": n_jobs}} + parameters = { + "Stacked Ensemble Regressor": { + "input_pipelines": input_pipelines, + "n_jobs": n_jobs, + } + } estimator = StackedEnsembleRegressor pipeline_class, pipeline_name = { - ProblemTypes.BINARY: (BinaryClassificationPipeline, "Stacked Ensemble Classification Pipeline"), - ProblemTypes.MULTICLASS: (MulticlassClassificationPipeline, "Stacked Ensemble Classification Pipeline"), - ProblemTypes.REGRESSION: (RegressionPipeline, "Stacked Ensemble Regression Pipeline")}[problem_type] - - return pipeline_class([estimator], parameters=parameters, - custom_name=pipeline_name, - random_seed=random_seed) + ProblemTypes.BINARY: ( + BinaryClassificationPipeline, + "Stacked Ensemble Classification Pipeline", + ), + ProblemTypes.MULTICLASS: ( + MulticlassClassificationPipeline, + "Stacked Ensemble Classification Pipeline", + ), + ProblemTypes.REGRESSION: ( + RegressionPipeline, + "Stacked Ensemble Regression Pipeline", + ), + }[problem_type] + + return pipeline_class( + [estimator], + parameters=parameters, + custom_name=pipeline_name, + random_seed=random_seed, + ) def _make_component_list_from_actions(actions): @@ -234,5 +283,7 @@ def _make_component_list_from_actions(actions): if action.action_code == DataCheckActionCode.IMPUTE_COL: metadata = action.metadata if metadata["is_target"]: - components.append(TargetImputer(impute_strategy=metadata["impute_strategy"])) + components.append( + TargetImputer(impute_strategy=metadata["impute_strategy"]) + ) return components diff --git a/evalml/preprocessing/__init__.py b/evalml/preprocessing/__init__.py index 8a49b9f2f7..fa728de2f2 100644 --- a/evalml/preprocessing/__init__.py +++ b/evalml/preprocessing/__init__.py @@ -3,6 +3,6 @@ split_data, number_of_features, target_distribution, - drop_nan_target_rows + drop_nan_target_rows, ) from .data_splitters import TrainingValidationSplit, TimeSeriesSplit diff --git a/evalml/preprocessing/data_splitters/balanced_classification_sampler.py b/evalml/preprocessing/data_splitters/balanced_classification_sampler.py index 0da810ab76..ffe0ee845a 100644 --- a/evalml/preprocessing/data_splitters/balanced_classification_sampler.py +++ b/evalml/preprocessing/data_splitters/balanced_classification_sampler.py @@ -7,7 +7,14 @@ class BalancedClassificationSampler(SamplerBase): """Class for balanced classification downsampler.""" - def __init__(self, sampling_ratio=0.25, sampling_ratio_dict=None, min_samples=100, min_percentage=0.1, random_seed=0): + def __init__( + self, + sampling_ratio=0.25, + sampling_ratio_dict=None, + min_samples=100, + min_percentage=0.1, + random_seed=0, + ): """ Arguments: sampling_ratio (float): The smallest minority:majority ratio that is accepted as 'balanced'. 
For instance, a 1:4 ratio would be @@ -28,11 +35,17 @@ def __init__(self, sampling_ratio=0.25, sampling_ratio_dict=None, min_samples=10 """ super().__init__(random_seed=random_seed) if sampling_ratio <= 0 or sampling_ratio > 1: - raise ValueError(f"sampling_ratio must be within (0, 1], but received {sampling_ratio}") + raise ValueError( + f"sampling_ratio must be within (0, 1], but received {sampling_ratio}" + ) if min_samples <= 0: - raise ValueError(f"min_sample must be greater than 0, but received {min_samples}") + raise ValueError( + f"min_sample must be greater than 0, but received {min_samples}" + ) if min_percentage <= 0 or min_percentage > 0.5: - raise ValueError(f"min_percentage must be between 0 and 0.5, inclusive, but received {min_percentage}") + raise ValueError( + f"min_percentage must be between 0 and 0.5, inclusive, but received {min_percentage}" + ) self.sampling_ratio = sampling_ratio self.min_samples = min_samples self.min_percentage = min_percentage @@ -58,13 +71,17 @@ def _find_ideal_samples(self, y): return {} # if any classes have less than min_samples counts and are less than min_percentage of the total data, # then it's severely imbalanced - if any(counts < self.min_samples) and any(normalized_counts < self.min_percentage): + if any(counts < self.min_samples) and any( + normalized_counts < self.min_percentage + ): return {} # otherwise, we are imbalanced enough to perform on this undersample_classes = counts[class_ratios <= self.sampling_ratio].index.values # find goal size, round it down if it's a float minority_class = min(counts.values) - goal_value = max(int((minority_class / self.sampling_ratio) // 1), self.min_samples) + goal_value = max( + int((minority_class / self.sampling_ratio) // 1), self.min_samples + ) # we don't want to drop less than 0 rows drop_values = {k: max(0, counts[k] - goal_value) for k in undersample_classes} return {k: v for k, v in drop_values.items() if v > 0} @@ -106,7 +123,9 @@ def fit_resample(self, X, y): # iterate through the classes we need to undersample and remove the number of samples we need to remove for key, value in result.items(): indices = y.index[y == key].values - indices_to_remove = self.random_state.choice(indices, value, replace=False) + indices_to_remove = self.random_state.choice( + indices, value, replace=False + ) indices_to_drop.extend(indices_to_remove) # indices of the y series original_indices = list(set(y.index.values).difference(set(indices_to_drop))) diff --git a/evalml/preprocessing/data_splitters/time_series_split.py b/evalml/preprocessing/data_splitters/time_series_split.py index 8b988ec932..773969f292 100644 --- a/evalml/preprocessing/data_splitters/time_series_split.py +++ b/evalml/preprocessing/data_splitters/time_series_split.py @@ -23,7 +23,7 @@ def __init__(self, max_delay=0, gap=0, date_index=None, n_splits=3): since we are interested in date_index (str): Name of the column containing the datetime information used to order the data. n_splits (int): number of data splits to make. 
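A brief hypothetical sketch of driving this splitter; the row count and parameter values are made up for illustration.

import pandas as pd
from evalml.preprocessing.data_splitters import TimeSeriesSplit

X = pd.DataFrame({"feature": range(12)})
y = pd.Series(range(12))

# With 12 rows and n_splits=3 each split covers 4 rows, which is larger than
# gap + max_delay, so the ValueError raised below is avoided.
ts_split = TimeSeriesSplit(gap=1, max_delay=1, n_splits=3)
for train_indices, test_indices in ts_split.split(X, y):
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]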
- """ + """ self.max_delay = max_delay self.gap = gap self.date_index = date_index @@ -57,7 +57,9 @@ def split(self, X, y=None, groups=None): # TimeSeriesPipeline convention of being able to pass in empty X dataframes # We'll do this by passing X=y if X is empty if self._check_if_empty(X) and self._check_if_empty(y): - raise ValueError("Both X and y cannot be None or empty in TimeSeriesSplit.split") + raise ValueError( + "Both X and y cannot be None or empty in TimeSeriesSplit.split" + ) elif self._check_if_empty(X) and not self._check_if_empty(y): split_kwargs = dict(X=y, groups=groups) max_index = y.shape[0] @@ -65,13 +67,15 @@ def split(self, X, y=None, groups=None): split_kwargs = dict(X=X, y=y, groups=groups) max_index = X.shape[0] - split_size = (max_index // self.n_splits) + split_size = max_index // self.n_splits if split_size < self.gap + self.max_delay: - raise ValueError(f"Since the data has {max_index} observations and n_splits={self.n_splits}, " - f"the smallest split would have {split_size} observations. " - f"Since {self.gap + self.max_delay} (gap + max_delay) > {split_size}, " - "then at least one of the splits would be empty by the time it reaches the pipeline. " - "Please use a smaller number of splits or collect more data.") + raise ValueError( + f"Since the data has {max_index} observations and n_splits={self.n_splits}, " + f"the smallest split would have {split_size} observations. " + f"Since {self.gap + self.max_delay} (gap + max_delay) > {split_size}, " + "then at least one of the splits would be empty by the time it reaches the pipeline. " + "Please use a smaller number of splits or collect more data." + ) for train, test in self._splitter.split(**split_kwargs): last_train = train[-1] diff --git a/evalml/preprocessing/data_splitters/training_validation_split.py b/evalml/preprocessing/data_splitters/training_validation_split.py index a6f4d0a693..15df8ca63b 100644 --- a/evalml/preprocessing/data_splitters/training_validation_split.py +++ b/evalml/preprocessing/data_splitters/training_validation_split.py @@ -6,7 +6,14 @@ class TrainingValidationSplit(BaseCrossValidator): """Split the training data into training and validation sets""" - def __init__(self, test_size=None, train_size=None, shuffle=False, stratify=None, random_seed=0): + def __init__( + self, + test_size=None, + train_size=None, + shuffle=False, + stratify=None, + random_seed=0, + ): """Create a TrainingValidation instance Arguments: @@ -35,12 +42,19 @@ def get_n_splits(): def split(self, X, y=None): """Divides the data into training and testing sets - Arguments: - X (pd.DataFrame): Dataframe of points to split - y (pd.Series): Series of points to split + Arguments: + X (pd.DataFrame): Dataframe of points to split + y (pd.Series): Series of points to split - Returns: - list: Indices to split data into training and test set + Returns: + list: Indices to split data into training and test set """ - train, test = train_test_split(np.arange(X.shape[0]), test_size=self.test_size, train_size=self.train_size, shuffle=self.shuffle, stratify=self.stratify, random_state=self.random_seed) + train, test = train_test_split( + np.arange(X.shape[0]), + test_size=self.test_size, + train_size=self.train_size, + shuffle=self.shuffle, + stratify=self.stratify, + random_state=self.random_seed, + ) return iter([(train, test)]) diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py index 82c04d7c7c..e822bd4115 100644 --- a/evalml/preprocessing/utils.py +++ b/evalml/preprocessing/utils.py @@ -2,11 +2,7 @@ from 
sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit from evalml.preprocessing.data_splitters import TrainingValidationSplit -from evalml.problem_types import ( - is_classification, - is_regression, - is_time_series -) +from evalml.problem_types import is_classification, is_regression, is_time_series from evalml.utils import infer_feature_types @@ -33,11 +29,11 @@ def load_data(path, index, target, n_rows=None, drop=None, verbose=True, **kwarg if verbose: # number of features - print(number_of_features(X.dtypes), end='\n\n') + print(number_of_features(X.dtypes), end="\n\n") # number of total training examples - info = 'Number of training examples: {}' - print(info.format(len(X)), end='\n') + info = "Number of training examples: {}" + print(info.format(len(X)), end="\n") # target distribution print(target_distribution(y)) @@ -45,7 +41,9 @@ def load_data(path, index, target, n_rows=None, drop=None, verbose=True, **kwarg return infer_feature_types(X), infer_feature_types(y) -def split_data(X, y, problem_type, problem_configuration=None, test_size=.2, random_seed=0): +def split_data( + X, y, problem_type, problem_configuration=None, test_size=0.2, random_seed=0 +): """Splits data into train and test sets. Arguments: @@ -65,11 +63,17 @@ def split_data(X, y, problem_type, problem_configuration=None, test_size=.2, ran data_splitter = None if is_time_series(problem_type): - data_splitter = TrainingValidationSplit(test_size=test_size, shuffle=False, stratify=None, random_seed=random_seed) + data_splitter = TrainingValidationSplit( + test_size=test_size, shuffle=False, stratify=None, random_seed=random_seed + ) elif is_regression(problem_type): - data_splitter = ShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed) + data_splitter = ShuffleSplit( + n_splits=1, test_size=test_size, random_state=random_seed + ) elif is_classification(problem_type): - data_splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed) + data_splitter = StratifiedShuffleSplit( + n_splits=1, test_size=test_size, random_state=random_seed + ) train, test = next(data_splitter.split(X, y)) @@ -91,16 +95,16 @@ def number_of_features(dtypes): pd.Series: dtypes and the number of features for each input type """ dtype_to_vtype = { - 'bool': 'Boolean', - 'int32': 'Numeric', - 'int64': 'Numeric', - 'float64': 'Numeric', - 'object': 'Categorical', - 'datetime64[ns]': 'Datetime', + "bool": "Boolean", + "int32": "Numeric", + "int64": "Numeric", + "float64": "Numeric", + "object": "Categorical", + "datetime64[ns]": "Datetime", } vtypes = dtypes.astype(str).map(dtype_to_vtype).value_counts() - return vtypes.sort_index().to_frame('Number of Features') + return vtypes.sort_index().to_frame("Number of Features") def target_distribution(targets): @@ -113,7 +117,7 @@ def target_distribution(targets): pd.Series: Target data and their frequency distribution as percentages. 
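A tiny illustrative call (the target values are made up); the percentage formatting follows the implementation just below.

import pandas as pd
from evalml.preprocessing import target_distribution

y = pd.Series([0, 0, 0, 1])
target_distribution(y)
# Expected to report roughly:
# Targets
# 0    75.00%
# 1    25.00%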
""" distribution = targets.value_counts() / len(targets) - return distribution.mul(100).apply('{:.2f}%'.format).rename_axis('Targets') + return distribution.mul(100).apply("{:.2f}%".format).rename_axis("Targets") def drop_nan_target_rows(X, y): diff --git a/evalml/problem_types/__init__.py b/evalml/problem_types/__init__.py index d61a30fb8f..6c1dd73d60 100644 --- a/evalml/problem_types/__init__.py +++ b/evalml/problem_types/__init__.py @@ -1,2 +1,10 @@ from .problem_types import ProblemTypes -from .utils import handle_problem_types, detect_problem_type, is_regression, is_binary, is_multiclass, is_classification, is_time_series +from .utils import ( + handle_problem_types, + detect_problem_type, + is_regression, + is_binary, + is_multiclass, + is_classification, + is_time_series, +) diff --git a/evalml/problem_types/problem_types.py b/evalml/problem_types/problem_types.py index 729d3ecc56..faa4d539fb 100644 --- a/evalml/problem_types/problem_types.py +++ b/evalml/problem_types/problem_types.py @@ -5,26 +5,29 @@ class ProblemTypes(Enum): """Enum defining the supported types of machine learning problems.""" - BINARY = 'binary' + + BINARY = "binary" """Binary classification problem.""" - MULTICLASS = 'multiclass' + MULTICLASS = "multiclass" """Multiclass classification problem.""" - REGRESSION = 'regression' + REGRESSION = "regression" """Regression problem.""" - TIME_SERIES_REGRESSION = 'time series regression' + TIME_SERIES_REGRESSION = "time series regression" """Time series regression problem.""" - TIME_SERIES_BINARY = 'time series binary' + TIME_SERIES_BINARY = "time series binary" """Time series binary classification problem.""" - TIME_SERIES_MULTICLASS = 'time series multiclass' + TIME_SERIES_MULTICLASS = "time series multiclass" """Time series multiclass classification problem.""" def __str__(self): - problem_type_dict = {ProblemTypes.BINARY.name: "binary", - ProblemTypes.MULTICLASS.name: "multiclass", - ProblemTypes.REGRESSION.name: "regression", - ProblemTypes.TIME_SERIES_REGRESSION.name: "time series regression", - ProblemTypes.TIME_SERIES_BINARY.name: "time series binary", - ProblemTypes.TIME_SERIES_MULTICLASS.name: "time series multiclass"} + problem_type_dict = { + ProblemTypes.BINARY.name: "binary", + ProblemTypes.MULTICLASS.name: "multiclass", + ProblemTypes.REGRESSION.name: "regression", + ProblemTypes.TIME_SERIES_REGRESSION.name: "time series regression", + ProblemTypes.TIME_SERIES_BINARY.name: "time series binary", + ProblemTypes.TIME_SERIES_MULTICLASS.name: "time series multiclass", + } return problem_type_dict[self.name] @classproperty diff --git a/evalml/problem_types/utils.py b/evalml/problem_types/utils.py index fda3350238..9fa83e6804 100644 --- a/evalml/problem_types/utils.py +++ b/evalml/problem_types/utils.py @@ -17,11 +17,13 @@ def handle_problem_types(problem_type): try: tpe = ProblemTypes._all_values[problem_type.upper()] except KeyError: - raise KeyError('Problem type \'{}\' does not exist'.format(problem_type)) + raise KeyError("Problem type '{}' does not exist".format(problem_type)) return tpe if isinstance(problem_type, ProblemTypes): return problem_type - raise ValueError('`handle_problem_types` was not passed a str or ProblemTypes object') + raise ValueError( + "`handle_problem_types` was not passed a str or ProblemTypes object" + ) def detect_problem_type(y): @@ -46,7 +48,7 @@ def detect_problem_type(y): if num_classes == 2: return ProblemTypes.BINARY if is_numeric_dtype(y.dtype): - if (num_classes > 10): + if num_classes > 10: return 
ProblemTypes.REGRESSION return ProblemTypes.MULTICLASS @@ -58,9 +60,11 @@ def is_regression(problem_type): problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list. Returns: - bool: Whether or not the provided problem_type is a regression problem type. -""" - return handle_problem_types(problem_type) in [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION] + bool: Whether or not the provided problem_type is a regression problem type.""" + return handle_problem_types(problem_type) in [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ] def is_binary(problem_type): @@ -70,9 +74,11 @@ def is_binary(problem_type): problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list. Returns: - bool: Whether or not the provided problem_type is a binary classification problem type. -""" - return handle_problem_types(problem_type) in [ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY] + bool: Whether or not the provided problem_type is a binary classification problem type.""" + return handle_problem_types(problem_type) in [ + ProblemTypes.BINARY, + ProblemTypes.TIME_SERIES_BINARY, + ] def is_multiclass(problem_type): @@ -82,9 +88,11 @@ def is_multiclass(problem_type): problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list. Returns: - bool: Whether or not the provided problem_type is a multiclass classification problem type. -""" - return handle_problem_types(problem_type) in [ProblemTypes.MULTICLASS, ProblemTypes.TIME_SERIES_MULTICLASS] + bool: Whether or not the provided problem_type is a multiclass classification problem type.""" + return handle_problem_types(problem_type) in [ + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_MULTICLASS, + ] def is_classification(problem_type): @@ -94,8 +102,7 @@ def is_classification(problem_type): problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list. Returns: - bool: Whether or not the provided problem_type is a classification problem type. -""" + bool: Whether or not the provided problem_type is a classification problem type.""" return is_binary(problem_type) or is_multiclass(problem_type) @@ -106,8 +113,9 @@ def is_time_series(problem_type): problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list. Returns: - bool: Whether or not the provided problem_type is a time series problem type. 
-""" - return handle_problem_types(problem_type) in [ProblemTypes.TIME_SERIES_BINARY, - ProblemTypes.TIME_SERIES_MULTICLASS, - ProblemTypes.TIME_SERIES_REGRESSION] + bool: Whether or not the provided problem_type is a time series problem type.""" + return handle_problem_types(problem_type) in [ + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ProblemTypes.TIME_SERIES_REGRESSION, + ] diff --git a/evalml/tests/automl_tests/dask_test_utils.py b/evalml/tests/automl_tests/dask_test_utils.py index 7a45f4a913..b874614957 100644 --- a/evalml/tests/automl_tests/dask_test_utils.py +++ b/evalml/tests/automl_tests/dask_test_utils.py @@ -19,19 +19,21 @@ def err_call(*args, **kwargs): optimize_thresholds = False error_callback = err_call random_seed = 0 -automl_data = AutoMLConfig(data_splitter=data_splitter, - problem_type=problem_type, - objective=objective, - additional_objectives=additional_objectives, - optimize_thresholds=optimize_thresholds, - error_callback=error_callback, - random_seed=random_seed, - X_schema=None, - y_schema=None) +automl_data = AutoMLConfig( + data_splitter=data_splitter, + problem_type=problem_type, + objective=objective, + additional_objectives=additional_objectives, + optimize_thresholds=optimize_thresholds, + error_callback=error_callback, + random_seed=random_seed, + X_schema=None, + y_schema=None, +) def delayed(delay): - """ Decorator to delay function evaluation. """ + """Decorator to delay function evaluation.""" def wrap(a_method): def do_delay(*args, **kw): @@ -48,7 +50,12 @@ class TestPipelineWithFitError(BinaryClassificationPipeline): custom_name = "PipelineWithError" def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) def new(self, parameters, random_seed=0): return self.__class__(parameters, random_seed=random_seed) @@ -66,27 +73,41 @@ class TestPipelineWithScoreError(BinaryClassificationPipeline): custom_name = "PipelineWithError" def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) def score(self, X, y, objectives): - raise PipelineScoreError(exceptions={"AUC": (Exception(), []), - "Log Loss Binary": (Exception(), [])}, - scored_successfully={"F1": 0.2, - "MCC Binary": 0.2, - "Precision": 0.8, - "Balanced Accuracy Binary": 0.2, - "Accuracy Binary": 0.2}) + raise PipelineScoreError( + exceptions={"AUC": (Exception(), []), "Log Loss Binary": (Exception(), [])}, + scored_successfully={ + "F1": 0.2, + "MCC Binary": 0.2, + "Precision": 0.8, + "Balanced Accuracy Binary": 0.2, + "Accuracy Binary": 0.2, + }, + ) class TestPipelineSlow(BinaryClassificationPipeline): - """ Pipeline for testing whose fit() should take longer than the + """Pipeline for testing whose fit() should take longer than the fast pipeline. This exists solely to test AutoMLSearch termination - and not complete fitting. 
""" + and not complete fitting.""" + component_graph = ["Baseline Classifier"] custom_name = "SlowPipeline" def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) def new(self, parameters, random_seed=0): return self.__class__(parameters, random_seed=random_seed) @@ -100,14 +121,20 @@ def fit(self, X, y): class TestPipelineFast(BinaryClassificationPipeline): - """ Pipeline for testing whose fit() should complete before the + """Pipeline for testing whose fit() should complete before the slow pipeline. This exists solely to test AutoMLSearch termination - and complete fitting. """ + and complete fitting.""" + component_graph = ["Baseline Classifier"] custom_name = "FastPipeline" def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) def new(self, parameters, random_seed=0): return self.__class__(parameters, random_seed=random_seed) @@ -121,17 +148,28 @@ def fit(self, X, y): class TestSchemaCheckPipeline(BinaryClassificationPipeline): - - def __init__(self, component_graph, parameters=None, custom_name=None, random_seed=0, - X_schema_to_check=None, y_schema_to_check=None): + def __init__( + self, + component_graph, + parameters=None, + custom_name=None, + random_seed=0, + X_schema_to_check=None, + y_schema_to_check=None, + ): self.X_schema_to_check = X_schema_to_check self.y_schema_to_check = y_schema_to_check super().__init__(component_graph, parameters, custom_name, random_seed) def clone(self): - return self.__class__(self.component_graph, parameters=self.parameters, custom_name=self.custom_name, - random_seed=self.random_seed, X_schema_to_check=self.X_schema_to_check, - y_schema_to_check=self.y_schema_to_check) + return self.__class__( + self.component_graph, + parameters=self.parameters, + custom_name=self.custom_name, + random_seed=self.random_seed, + X_schema_to_check=self.X_schema_to_check, + y_schema_to_check=self.y_schema_to_check, + ) def fit(self, X, y): assert X.ww.schema == self.X_schema_to_check diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index bb70aea88d..76388e95e6 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -16,19 +16,19 @@ from evalml.automl.callbacks import ( log_error_callback, raise_error_callback, - silent_error_callback + silent_error_callback, ) from evalml.automl.utils import ( _LARGE_DATA_PERCENT_VALIDATION, _LARGE_DATA_ROW_THRESHOLD, - get_default_primary_search_objective + get_default_primary_search_objective, ) from evalml.demos import load_breast_cancer, load_wine from evalml.exceptions import ( AutoMLSearchException, PipelineNotFoundError, PipelineNotYetFittedError, - PipelineScoreError + PipelineScoreError, ) from evalml.model_family import ModelFamily from evalml.objectives import ( @@ -36,13 +36,13 @@ BinaryClassificationObjective, CostBenefitMatrix, FraudCost, - RegressionObjective + RegressionObjective, ) from evalml.objectives.utils import ( get_all_objective_names, get_core_objectives, get_non_core_objectives, - get_objective + get_objective, ) from evalml.pipelines 
import ( BinaryClassificationPipeline, @@ -50,28 +50,38 @@ MulticlassClassificationPipeline, PipelineBase, RegressionPipeline, - StackedEnsembleClassifier -) -from evalml.pipelines.components.utils import ( - allowed_model_families, - get_estimators + StackedEnsembleClassifier, ) +from evalml.pipelines.components.utils import allowed_model_families, get_estimators from evalml.pipelines.utils import make_pipeline from evalml.preprocessing import TrainingValidationSplit from evalml.problem_types import ( ProblemTypes, handle_problem_types, is_classification, - is_time_series + is_time_series, ) from evalml.tuners import NoParamsException, RandomSearchTuner -@pytest.mark.parametrize("automl_type,objective", - zip([ProblemTypes.REGRESSION, ProblemTypes.MULTICLASS, ProblemTypes.BINARY, ProblemTypes.BINARY], - ['R2', 'log loss multiclass', 'log loss binary', 'F1'])) +@pytest.mark.parametrize( + "automl_type,objective", + zip( + [ + ProblemTypes.REGRESSION, + ProblemTypes.MULTICLASS, + ProblemTypes.BINARY, + ProblemTypes.BINARY, + ], + ["R2", "log loss multiclass", "log loss binary", "F1"], + ), +) def test_search_results(X_y_regression, X_y_binary, X_y_multi, automl_type, objective): - expected_cv_data_keys = {'all_objective_scores', "mean_cv_score", 'binary_classification_threshold'} + expected_cv_data_keys = { + "all_objective_scores", + "mean_cv_score", + "binary_classification_threshold", + } if automl_type == ProblemTypes.REGRESSION: expected_pipeline_class = RegressionPipeline X, y = X_y_regression @@ -82,57 +92,134 @@ def test_search_results(X_y_regression, X_y_binary, X_y_multi, automl_type, obje expected_pipeline_class = MulticlassClassificationPipeline X, y = X_y_multi - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=automl_type, optimize_thresholds=False, objective=objective, max_iterations=2, n_jobs=1) - automl.search() - assert automl.results.keys() == {'pipeline_results', 'search_order'} - assert automl.results['search_order'] == [0, 1] - assert len(automl.results['pipeline_results']) == 2 - for pipeline_id, results in automl.results['pipeline_results'].items(): - assert results.keys() == {'id', 'pipeline_name', 'pipeline_class', 'pipeline_summary', 'parameters', "mean_cv_score", - "standard_deviation_cv_score", 'high_variance_cv', 'training_time', - 'cv_data', 'percent_better_than_baseline_all_objectives', - 'percent_better_than_baseline', 'validation_score'} - assert results['id'] == pipeline_id - assert isinstance(results['pipeline_name'], str) - assert issubclass(results['pipeline_class'], expected_pipeline_class) - assert isinstance(results['pipeline_summary'], str) - assert isinstance(results['parameters'], dict) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=automl_type, + optimize_thresholds=False, + objective=objective, + max_iterations=2, + n_jobs=1, + ) + automl.search() + assert automl.results.keys() == {"pipeline_results", "search_order"} + assert automl.results["search_order"] == [0, 1] + assert len(automl.results["pipeline_results"]) == 2 + for pipeline_id, results in automl.results["pipeline_results"].items(): + assert results.keys() == { + "id", + "pipeline_name", + "pipeline_class", + "pipeline_summary", + "parameters", + "mean_cv_score", + "standard_deviation_cv_score", + "high_variance_cv", + "training_time", + "cv_data", + "percent_better_than_baseline_all_objectives", + "percent_better_than_baseline", + "validation_score", + } + assert results["id"] == pipeline_id + assert isinstance(results["pipeline_name"], str) + assert 
issubclass(results["pipeline_class"], expected_pipeline_class) + assert isinstance(results["pipeline_summary"], str) + assert isinstance(results["parameters"], dict) assert isinstance(results["mean_cv_score"], float) - assert isinstance(results['high_variance_cv'], bool) - assert isinstance(results['cv_data'], list) - for cv_result in results['cv_data']: + assert isinstance(results["high_variance_cv"], bool) + assert isinstance(results["cv_data"], list) + for cv_result in results["cv_data"]: assert cv_result.keys() == expected_cv_data_keys - if objective == 'F1': - assert cv_result['binary_classification_threshold'] == 0.5 + if objective == "F1": + assert cv_result["binary_classification_threshold"] == 0.5 else: - assert cv_result['binary_classification_threshold'] is None + assert cv_result["binary_classification_threshold"] is None all_objective_scores = cv_result["all_objective_scores"] for score in all_objective_scores.values(): assert score is not None - assert automl.get_pipeline(pipeline_id).parameters == results['parameters'] - assert results['validation_score'] == pd.Series([fold["mean_cv_score"] for fold in results['cv_data']])[0] + assert automl.get_pipeline(pipeline_id).parameters == results["parameters"] + assert ( + results["validation_score"] + == pd.Series([fold["mean_cv_score"] for fold in results["cv_data"]])[0] + ) assert isinstance(automl.rankings, pd.DataFrame) assert isinstance(automl.full_rankings, pd.DataFrame) - assert np.all(automl.rankings.dtypes == pd.Series( - [np.dtype('int64'), np.dtype('O'), np.dtype('float64'), np.dtype('float64'), np.dtype('float64'), np.dtype('float64'), np.dtype('bool'), np.dtype('O')], - index=['id', 'pipeline_name', "mean_cv_score", "standard_deviation_cv_score", "validation_score", 'percent_better_than_baseline', 'high_variance_cv', 'parameters'])) - assert np.all(automl.full_rankings.dtypes == pd.Series( - [np.dtype('int64'), np.dtype('O'), np.dtype('float64'), np.dtype('float64'), np.dtype('float64'), np.dtype('float64'), np.dtype('bool'), np.dtype('O')], - index=['id', 'pipeline_name', "mean_cv_score", "standard_deviation_cv_score", "validation_score", 'percent_better_than_baseline', 'high_variance_cv', 'parameters'])) - - -@pytest.mark.parametrize("automl_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION]) -@patch('evalml.pipelines.RegressionPipeline.score') -@patch('evalml.pipelines.RegressionPipeline.fit') -@patch('evalml.pipelines.MulticlassClassificationPipeline.score') -@patch('evalml.pipelines.MulticlassClassificationPipeline.fit') -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_pipeline_limits(mock_fit_binary, mock_score_binary, - mock_fit_multi, mock_score_multi, - mock_fit_regression, mock_score_regression, - automl_type, caplog, - X_y_binary, X_y_multi, X_y_regression): + assert np.all( + automl.rankings.dtypes + == pd.Series( + [ + np.dtype("int64"), + np.dtype("O"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("bool"), + np.dtype("O"), + ], + index=[ + "id", + "pipeline_name", + "mean_cv_score", + "standard_deviation_cv_score", + "validation_score", + "percent_better_than_baseline", + "high_variance_cv", + "parameters", + ], + ) + ) + assert np.all( + automl.full_rankings.dtypes + == pd.Series( + [ + np.dtype("int64"), + np.dtype("O"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("float64"), + np.dtype("bool"), + 
np.dtype("O"), + ], + index=[ + "id", + "pipeline_name", + "mean_cv_score", + "standard_deviation_cv_score", + "validation_score", + "percent_better_than_baseline", + "high_variance_cv", + "parameters", + ], + ) + ) + + +@pytest.mark.parametrize( + "automl_type", + [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION], +) +@patch("evalml.pipelines.RegressionPipeline.score") +@patch("evalml.pipelines.RegressionPipeline.fit") +@patch("evalml.pipelines.MulticlassClassificationPipeline.score") +@patch("evalml.pipelines.MulticlassClassificationPipeline.fit") +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_pipeline_limits( + mock_fit_binary, + mock_score_binary, + mock_fit_multi, + mock_score_multi, + mock_fit_regression, + mock_score_regression, + automl_type, + caplog, + X_y_binary, + X_y_multi, + X_y_regression, +): if automl_type == ProblemTypes.BINARY: X, y = X_y_binary elif automl_type == ProblemTypes.MULTICLASS: @@ -140,30 +227,34 @@ def test_pipeline_limits(mock_fit_binary, mock_score_binary, elif automl_type == ProblemTypes.REGRESSION: X, y = X_y_regression - mock_score_binary.return_value = {'Log Loss Binary': 1.0} - mock_score_multi.return_value = {'Log Loss Multiclass': 1.0} - mock_score_regression.return_value = {'R2': 1.0} + mock_score_binary.return_value = {"Log Loss Binary": 1.0} + mock_score_multi.return_value = {"Log Loss Multiclass": 1.0} + mock_score_regression.return_value = {"R2": 1.0} - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=automl_type, max_iterations=1) + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type=automl_type, max_iterations=1 + ) automl.search() out = caplog.text assert "Searching up to 1 pipelines. " in out - assert len(automl.results['pipeline_results']) == 1 + assert len(automl.results["pipeline_results"]) == 1 caplog.clear() automl = AutoMLSearch(X_train=X, y_train=y, problem_type=automl_type, max_time=1) automl.search() out = caplog.text assert "Will stop searching for new pipelines after 1 seconds" in out - assert len(automl.results['pipeline_results']) >= 1 + assert len(automl.results["pipeline_results"]) >= 1 caplog.clear() - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=automl_type, max_time=1, max_iterations=5) + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type=automl_type, max_time=1, max_iterations=5 + ) automl.search() out = caplog.text assert "Searching up to 5 pipelines. " in out assert "Will stop searching for new pipelines after 1 seconds" in out - assert len(automl.results['pipeline_results']) <= 5 + assert len(automl.results["pipeline_results"]) <= 5 caplog.clear() automl = AutoMLSearch(X_train=X, y_train=y, problem_type=automl_type) @@ -171,136 +262,173 @@ def test_pipeline_limits(mock_fit_binary, mock_score_binary, out = caplog.text assert "Using default limit of max_batches=1." 
in out assert "Searching up to 1 batches for a total of" in out - assert len(automl.results['pipeline_results']) > 5 + assert len(automl.results["pipeline_results"]) > 5 caplog.clear() - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=automl_type, max_time=1e-16) + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type=automl_type, max_time=1e-16 + ) automl.search() out = caplog.text assert "Will stop searching for new pipelines after 0 seconds" in out # search will always run at least one pipeline - assert len(automl.results['pipeline_results']) >= 1 + assert len(automl.results["pipeline_results"]) >= 1 -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_pipeline_fit_raises(mock_fit, X_y_binary, caplog): - msg = 'all your model are belong to us' + msg = "all your model are belong to us" mock_fit.side_effect = Exception(msg) X, y = X_y_binary # Don't train the best pipeline, since this test mocks the pipeline.fit() method and causes it to raise an exception, # which we don't want to raise while fitting the best pipeline. - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=1, train_best_pipeline=False) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=1, + train_best_pipeline=False, + ) automl.search() out = caplog.text - assert 'Exception during automl search' in out - pipeline_results = automl.results.get('pipeline_results', {}) + assert "Exception during automl search" in out + pipeline_results = automl.results.get("pipeline_results", {}) assert len(pipeline_results) == 1 - cv_scores_all = pipeline_results[0].get('cv_data', {}) + cv_scores_all = pipeline_results[0].get("cv_data", {}) for cv_scores in cv_scores_all: - for name, score in cv_scores['all_objective_scores'].items(): - if name in ['# Training', '# Validation']: + for name, score in cv_scores["all_objective_scores"].items(): + if name in ["# Training", "# Validation"]: assert score > 0 else: assert np.isnan(score) -@patch('evalml.pipelines.BinaryClassificationPipeline.score') +@patch("evalml.pipelines.BinaryClassificationPipeline.score") def test_pipeline_score_raises(mock_score, X_y_binary, caplog): - msg = 'all your model are belong to us' + msg = "all your model are belong to us" mock_score.side_effect = Exception(msg) X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=1, n_jobs=1) + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", max_iterations=1, n_jobs=1 + ) automl.search() out = caplog.text - assert 'Exception during automl search' in out - assert 'All scores will be replaced with nan.' in out - pipeline_results = automl.results.get('pipeline_results', {}) + assert "Exception during automl search" in out + assert "All scores will be replaced with nan." 
in out + pipeline_results = automl.results.get("pipeline_results", {}) assert len(pipeline_results) == 1 cv_scores_all = pipeline_results[0]["cv_data"][0]["all_objective_scores"] - objective_scores = {o.name: cv_scores_all[o.name] for o in [automl.objective] + automl.additional_objectives} + objective_scores = { + o.name: cv_scores_all[o.name] + for o in [automl.objective] + automl.additional_objectives + } assert np.isnan(list(objective_scores.values())).all() -@patch('evalml.objectives.AUC.score') +@patch("evalml.objectives.AUC.score") def test_objective_score_raises(mock_score, X_y_binary, caplog): - msg = 'all your model are belong to us' + msg = "all your model are belong to us" mock_score.side_effect = Exception(msg) X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=1, n_jobs=1) + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", max_iterations=1, n_jobs=1 + ) automl.search() out = caplog.text assert msg in out - pipeline_results = automl.results.get('pipeline_results') + pipeline_results = automl.results.get("pipeline_results") assert len(pipeline_results) == 1 - cv_scores_all = pipeline_results[0].get('cv_data') - scores = cv_scores_all[0]['all_objective_scores'] - auc_score = scores.pop('AUC') + cv_scores_all = pipeline_results[0].get("cv_data") + scores = cv_scores_all[0]["all_objective_scores"] + auc_score = scores.pop("AUC") assert np.isnan(auc_score) assert not np.isnan(list(scores.values())).any() def test_rankings(X_y_binary, X_y_regression): X, y = X_y_binary - model_families = ['random_forest'] - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', allowed_model_families=model_families, - max_iterations=3, n_jobs=1) + model_families = ["random_forest"] + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + allowed_model_families=model_families, + max_iterations=3, + n_jobs=1, + ) automl.search() assert len(automl.full_rankings) == 3 assert len(automl.rankings) == 2 X, y = X_y_regression - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', allowed_model_families=model_families, max_iterations=3, - n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + allowed_model_families=model_families, + max_iterations=3, + n_jobs=1, + ) automl.search() assert len(automl.full_rankings) == 3 assert len(automl.rankings) == 2 -@patch('evalml.objectives.BinaryClassificationObjective.optimize_threshold') -@patch('evalml.pipelines.BinaryClassificationPipeline._encode_targets', side_effect=lambda y: y) -@patch('evalml.pipelines.BinaryClassificationPipeline.predict_proba') -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_automl_str_search(mock_fit, mock_score, mock_predict_proba, mock_encode_targets, mock_optimize_threshold, X_y_binary): +@patch("evalml.objectives.BinaryClassificationObjective.optimize_threshold") +@patch( + "evalml.pipelines.BinaryClassificationPipeline._encode_targets", + side_effect=lambda y: y, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba") +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_automl_str_search( + mock_fit, + mock_score, + mock_predict_proba, + mock_encode_targets, + mock_optimize_threshold, + X_y_binary, +): def _dummy_callback(pipeline, automl_obj): return None X, y = X_y_binary search_params = { - 
'problem_type': 'binary', - 'objective': 'F1', - 'max_time': 100, - 'max_iterations': 5, - 'patience': 2, - 'tolerance': 0.5, - 'allowed_model_families': ['random_forest', 'linear_model'], - 'data_splitter': StratifiedKFold(n_splits=5), - 'tuner_class': RandomSearchTuner, - 'start_iteration_callback': _dummy_callback, - 'add_result_callback': None, - 'additional_objectives': ['Precision', 'AUC'], - 'n_jobs': 2, - 'optimize_thresholds': True + "problem_type": "binary", + "objective": "F1", + "max_time": 100, + "max_iterations": 5, + "patience": 2, + "tolerance": 0.5, + "allowed_model_families": ["random_forest", "linear_model"], + "data_splitter": StratifiedKFold(n_splits=5), + "tuner_class": RandomSearchTuner, + "start_iteration_callback": _dummy_callback, + "add_result_callback": None, + "additional_objectives": ["Precision", "AUC"], + "n_jobs": 2, + "optimize_thresholds": True, } param_str_reps = { - 'Objective': search_params['objective'], - 'Max Time': search_params['max_time'], - 'Max Iterations': search_params['max_iterations'], - 'Allowed Pipelines': [], - 'Patience': search_params['patience'], - 'Tolerance': search_params['tolerance'], - 'Data Splitting': 'StratifiedKFold(n_splits=5, random_state=None, shuffle=False)', - 'Tuner': 'RandomSearchTuner', - 'Start Iteration Callback': '_dummy_callback', - 'Add Result Callback': None, - 'Additional Objectives': search_params['additional_objectives'], - 'Random Seed': 0, - 'n_jobs': search_params['n_jobs'], - 'Optimize Thresholds': search_params['optimize_thresholds'] + "Objective": search_params["objective"], + "Max Time": search_params["max_time"], + "Max Iterations": search_params["max_iterations"], + "Allowed Pipelines": [], + "Patience": search_params["patience"], + "Tolerance": search_params["tolerance"], + "Data Splitting": "StratifiedKFold(n_splits=5, random_state=None, shuffle=False)", + "Tuner": "RandomSearchTuner", + "Start Iteration Callback": "_dummy_callback", + "Add Result Callback": None, + "Additional Objectives": search_params["additional_objectives"], + "Random Seed": 0, + "n_jobs": search_params["n_jobs"], + "Optimize Thresholds": search_params["optimize_thresholds"], } automl = AutoMLSearch(X_train=X, y_train=y, **search_params) @@ -327,34 +455,35 @@ def _dummy_callback(pipeline, automl_obj): str_rep = str(automl) assert "Search Results:" in str_rep - assert automl.rankings.drop(['parameters'], axis='columns').to_string() in str_rep + assert automl.rankings.drop(["parameters"], axis="columns").to_string() in str_rep def test_automl_str_no_param_search(X_y_binary): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary') + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary") param_str_reps = { - 'Objective': 'Log Loss Binary', - 'Max Time': 'None', - 'Max Iterations': 'None', - 'Allowed Pipelines': [], - 'Patience': 'None', - 'Tolerance': '0.0', - 'Data Splitting': 'StratifiedKFold(n_splits=5, random_state=None, shuffle=False)', - 'Tuner': 'SKOptTuner', - 'Additional Objectives': [ - 'AUC', - 'Accuracy Binary', - 'Balanced Accuracy Binary', - 'F1', - 'MCC Binary', - 'Precision'], - 'Start Iteration Callback': 'None', - 'Add Result Callback': 'None', - 'Random Seed': 0, - 'n_jobs': '-1', - 'Optimize Thresholds': 'False' + "Objective": "Log Loss Binary", + "Max Time": "None", + "Max Iterations": "None", + "Allowed Pipelines": [], + "Patience": "None", + "Tolerance": "0.0", + "Data Splitting": "StratifiedKFold(n_splits=5, random_state=None, shuffle=False)", + "Tuner": 
"SKOptTuner", + "Additional Objectives": [ + "AUC", + "Accuracy Binary", + "Balanced Accuracy Binary", + "F1", + "MCC Binary", + "Precision", + ], + "Start Iteration Callback": "None", + "Add Result Callback": "None", + "Random Seed": 0, + "n_jobs": "-1", + "Optimize Thresholds": "False", } str_rep = str(automl) @@ -366,14 +495,17 @@ def test_automl_str_no_param_search(X_y_binary): assert "Search Results" not in str_rep -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_automl_feature_selection(mock_fit, mock_score, X_y_binary): X, y = X_y_binary - mock_score.return_value = {'Log Loss Binary': 1.0} + mock_score.return_value = {"Log Loss Binary": 1.0} class MockFeatureSelectionPipeline(BinaryClassificationPipeline): - component_graph = ['RF Classifier Select From Model', 'Logistic Regression Classifier'] + component_graph = [ + "RF Classifier Select From Model", + "Logistic Regression Classifier", + ] def __init__(self, parameters, random_seed=0): super().__init__(self.component_graph, parameters=parameters) @@ -389,221 +521,336 @@ def fit(self, X, y): allowed_pipelines = [MockFeatureSelectionPipeline({})] start_iteration_callback = MagicMock() - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=2, - start_iteration_callback=start_iteration_callback, allowed_pipelines=allowed_pipelines) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=2, + start_iteration_callback=start_iteration_callback, + allowed_pipelines=allowed_pipelines, + ) automl.search() assert start_iteration_callback.call_count == 2 proposed_parameters = start_iteration_callback.call_args_list[1][0][0].parameters - assert proposed_parameters.keys() == {'RF Classifier Select From Model', 'Logistic Regression Classifier'} - assert proposed_parameters['RF Classifier Select From Model']['number_features'] == X.shape[1] + assert proposed_parameters.keys() == { + "RF Classifier Select From Model", + "Logistic Regression Classifier", + } + assert ( + proposed_parameters["RF Classifier Select From Model"]["number_features"] + == X.shape[1] + ) -@patch('evalml.tuners.random_search_tuner.RandomSearchTuner.is_search_space_exhausted') -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_automl_tuner_exception(mock_fit, mock_score, mock_is_search_space_exhausted, X_y_binary): - mock_score.return_value = {'Log Loss Binary': 1.0} +@patch("evalml.tuners.random_search_tuner.RandomSearchTuner.is_search_space_exhausted") +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_automl_tuner_exception( + mock_fit, mock_score, mock_is_search_space_exhausted, X_y_binary +): + mock_score.return_value = {"Log Loss Binary": 1.0} X, y = X_y_binary error_text = "Cannot create a unique set of unexplored parameters. Try expanding the search space." 
mock_is_search_space_exhausted.side_effect = NoParamsException(error_text) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', objective="R2", tuner_class=RandomSearchTuner, max_iterations=10) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + objective="R2", + tuner_class=RandomSearchTuner, + max_iterations=10, + ) with pytest.raises(NoParamsException, match=error_text): automl.search() -@patch('evalml.automl.automl_algorithm.IterativeAlgorithm.next_batch') -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +@patch("evalml.automl.automl_algorithm.IterativeAlgorithm.next_batch") +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_automl_algorithm(mock_fit, mock_score, mock_algo_next_batch, X_y_binary): X, y = X_y_binary - mock_score.return_value = {'Log Loss Binary': 1.0} + mock_score.return_value = {"Log Loss Binary": 1.0} mock_algo_next_batch.side_effect = StopIteration("that's all, folks") - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=5) + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=5) automl.search() mock_fit.assert_called() mock_score.assert_called() assert mock_algo_next_batch.call_count == 1 - pipeline_results = automl.results.get('pipeline_results', {}) + pipeline_results = automl.results.get("pipeline_results", {}) assert len(pipeline_results) == 1 assert pipeline_results[0].get("mean_cv_score") == 1.0 -@patch('evalml.automl.automl_algorithm.IterativeAlgorithm.__init__') -def test_automl_allowed_pipelines_algorithm(mock_algo_init, dummy_binary_pipeline_class, X_y_binary): - mock_algo_init.side_effect = Exception('mock algo init') +@patch("evalml.automl.automl_algorithm.IterativeAlgorithm.__init__") +def test_automl_allowed_pipelines_algorithm( + mock_algo_init, dummy_binary_pipeline_class, X_y_binary +): + mock_algo_init.side_effect = Exception("mock algo init") X, y = X_y_binary allowed_pipelines = [dummy_binary_pipeline_class({})] - with pytest.raises(Exception, match='mock algo init'): - AutoMLSearch(X_train=X, y_train=y, problem_type='binary', allowed_pipelines=allowed_pipelines, max_iterations=10) + with pytest.raises(Exception, match="mock algo init"): + AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + allowed_pipelines=allowed_pipelines, + max_iterations=10, + ) assert mock_algo_init.call_count == 1 _, kwargs = mock_algo_init.call_args - assert kwargs['max_iterations'] == 10 - assert kwargs['allowed_pipelines'] == allowed_pipelines + assert kwargs["max_iterations"] == 10 + assert kwargs["allowed_pipelines"] == allowed_pipelines allowed_model_families = [ModelFamily.RANDOM_FOREST] - with pytest.raises(Exception, match='mock algo init'): - AutoMLSearch(X_train=X, y_train=y, problem_type='binary', allowed_model_families=allowed_model_families, max_iterations=1) + with pytest.raises(Exception, match="mock algo init"): + AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + allowed_model_families=allowed_model_families, + max_iterations=1, + ) assert mock_algo_init.call_count == 2 _, kwargs = mock_algo_init.call_args - assert kwargs['max_iterations'] == 1 - for actual, expected in zip(kwargs['allowed_pipelines'], [make_pipeline(X, y, estimator, ProblemTypes.BINARY) for estimator in get_estimators(ProblemTypes.BINARY, model_families=allowed_model_families)]): + assert 
kwargs["max_iterations"] == 1 + for actual, expected in zip( + kwargs["allowed_pipelines"], + [ + make_pipeline(X, y, estimator, ProblemTypes.BINARY) + for estimator in get_estimators( + ProblemTypes.BINARY, model_families=allowed_model_families + ) + ], + ): assert actual.parameters == expected.parameters def test_automl_serialization(X_y_binary, tmpdir): X, y = X_y_binary - path = os.path.join(str(tmpdir), 'automl.pkl') + path = os.path.join(str(tmpdir), "automl.pkl") num_max_iterations = 5 - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=num_max_iterations, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=num_max_iterations, + n_jobs=1, + ) automl.search() automl.save(path) loaded_automl = automl.load(path) for i in range(num_max_iterations): - assert automl.get_pipeline(i).__class__ == loaded_automl.get_pipeline(i).__class__ - assert automl.get_pipeline(i).parameters == loaded_automl.get_pipeline(i).parameters - - for id_, pipeline_results in automl.results['pipeline_results'].items(): - loaded_ = loaded_automl.results['pipeline_results'][id_] + assert ( + automl.get_pipeline(i).__class__ == loaded_automl.get_pipeline(i).__class__ + ) + assert ( + automl.get_pipeline(i).parameters + == loaded_automl.get_pipeline(i).parameters + ) + + for id_, pipeline_results in automl.results["pipeline_results"].items(): + loaded_ = loaded_automl.results["pipeline_results"][id_] for name in pipeline_results: # Use np to check percent_better_than_baseline because of (possible) nans - if name == 'percent_better_than_baseline_all_objectives': + if name == "percent_better_than_baseline_all_objectives": for objective_name, value in pipeline_results[name].items(): - np.testing.assert_almost_equal(value, loaded_[name][objective_name]) - elif name == 'percent_better_than_baseline': - np.testing.assert_almost_equal(pipeline_results[name], loaded_[name]) + np.testing.assert_almost_equal( + value, loaded_[name][objective_name] + ) + elif name == "percent_better_than_baseline": + np.testing.assert_almost_equal( + pipeline_results[name], loaded_[name] + ) else: assert pipeline_results[name] == loaded_[name] pd.testing.assert_frame_equal(automl.rankings, loaded_automl.rankings) -@patch('cloudpickle.dump') +@patch("cloudpickle.dump") def test_automl_serialization_protocol(mock_cloudpickle_dump, tmpdir, X_y_binary): X, y = X_y_binary - path = os.path.join(str(tmpdir), 'automl.pkl') - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=5, n_jobs=1) + path = os.path.join(str(tmpdir), "automl.pkl") + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", max_iterations=5, n_jobs=1 + ) automl.save(path) assert len(mock_cloudpickle_dump.call_args_list) == 1 - assert mock_cloudpickle_dump.call_args_list[0][1]['protocol'] == cloudpickle.DEFAULT_PROTOCOL + assert ( + mock_cloudpickle_dump.call_args_list[0][1]["protocol"] + == cloudpickle.DEFAULT_PROTOCOL + ) mock_cloudpickle_dump.reset_mock() automl.save(path, pickle_protocol=42) assert len(mock_cloudpickle_dump.call_args_list) == 1 - assert mock_cloudpickle_dump.call_args_list[0][1]['protocol'] == 42 + assert mock_cloudpickle_dump.call_args_list[0][1]["protocol"] == 42 def test_invalid_data_splitter(X_y_binary): X, y = X_y_binary data_splitter = pd.DataFrame() - with pytest.raises(ValueError, match='Not a valid data splitter'): - AutoMLSearch(X_train=X, y_train=y, problem_type='binary', data_splitter=data_splitter) + with 
pytest.raises(ValueError, match="Not a valid data splitter"): + AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", data_splitter=data_splitter + ) -@patch('evalml.pipelines.BinaryClassificationPipeline.score') +@patch("evalml.pipelines.BinaryClassificationPipeline.score") def test_large_dataset_binary(mock_score): - X = pd.DataFrame({'col_0': [i for i in range(101000)]}) + X = pd.DataFrame({"col_0": [i for i in range(101000)]}) y = pd.Series([i % 2 for i in range(101000)]) - fraud_objective = FraudCost(amount_col='col_0') - - automl = AutoMLSearch(X_train=X, y_train=y, - problem_type='binary', - objective=fraud_objective, - additional_objectives=['auc', 'f1', 'precision'], - max_time=1, - max_iterations=1, - optimize_thresholds=True, - n_jobs=1) + fraud_objective = FraudCost(amount_col="col_0") + + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective=fraud_objective, + additional_objectives=["auc", "f1", "precision"], + max_time=1, + max_iterations=1, + optimize_thresholds=True, + n_jobs=1, + ) mock_score.return_value = {automl.objective.name: 1.234} automl.search() assert isinstance(automl.data_splitter, TrainingValidationSplit) assert automl.data_splitter.get_n_splits() == 1 - for pipeline_id in automl.results['search_order']: - assert len(automl.results['pipeline_results'][pipeline_id]['cv_data']) == 1 - assert automl.results['pipeline_results'][pipeline_id]['cv_data'][0]["mean_cv_score"] == 1.234 - assert automl.results['pipeline_results'][pipeline_id]["mean_cv_score"] == automl.results['pipeline_results'][pipeline_id]['validation_score'] - - -@patch('evalml.pipelines.MulticlassClassificationPipeline.score') + for pipeline_id in automl.results["search_order"]: + assert len(automl.results["pipeline_results"][pipeline_id]["cv_data"]) == 1 + assert ( + automl.results["pipeline_results"][pipeline_id]["cv_data"][0][ + "mean_cv_score" + ] + == 1.234 + ) + assert ( + automl.results["pipeline_results"][pipeline_id]["mean_cv_score"] + == automl.results["pipeline_results"][pipeline_id]["validation_score"] + ) + + +@patch("evalml.pipelines.MulticlassClassificationPipeline.score") def test_large_dataset_multiclass(mock_score): - X = pd.DataFrame({'col_0': [i for i in range(101000)]}) + X = pd.DataFrame({"col_0": [i for i in range(101000)]}) y = pd.Series([i % 4 for i in range(101000)]) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', max_time=1, max_iterations=1, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="multiclass", + max_time=1, + max_iterations=1, + n_jobs=1, + ) mock_score.return_value = {automl.objective.name: 1.234} automl.search() assert isinstance(automl.data_splitter, TrainingValidationSplit) assert automl.data_splitter.get_n_splits() == 1 - for pipeline_id in automl.results['search_order']: - assert len(automl.results['pipeline_results'][pipeline_id]['cv_data']) == 1 - assert automl.results['pipeline_results'][pipeline_id]['cv_data'][0]["mean_cv_score"] == 1.234 - assert automl.results['pipeline_results'][pipeline_id]["mean_cv_score"] == automl.results['pipeline_results'][pipeline_id]['validation_score'] - - -@patch('evalml.pipelines.RegressionPipeline.score') + for pipeline_id in automl.results["search_order"]: + assert len(automl.results["pipeline_results"][pipeline_id]["cv_data"]) == 1 + assert ( + automl.results["pipeline_results"][pipeline_id]["cv_data"][0][ + "mean_cv_score" + ] + == 1.234 + ) + assert ( + automl.results["pipeline_results"][pipeline_id]["mean_cv_score"] + == 
automl.results["pipeline_results"][pipeline_id]["validation_score"] + ) + + +@patch("evalml.pipelines.RegressionPipeline.score") def test_large_dataset_regression(mock_score): - X = pd.DataFrame({'col_0': [i for i in range(101000)]}) + X = pd.DataFrame({"col_0": [i for i in range(101000)]}) y = pd.Series([i for i in range(101000)]) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', max_time=1, max_iterations=1, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + max_time=1, + max_iterations=1, + n_jobs=1, + ) mock_score.return_value = {automl.objective.name: 1.234} automl.search() assert isinstance(automl.data_splitter, TrainingValidationSplit) assert automl.data_splitter.get_n_splits() == 1 - for pipeline_id in automl.results['search_order']: - assert len(automl.results['pipeline_results'][pipeline_id]['cv_data']) == 1 - assert automl.results['pipeline_results'][pipeline_id]['cv_data'][0]["mean_cv_score"] == 1.234 - assert automl.results['pipeline_results'][pipeline_id]["mean_cv_score"] == automl.results['pipeline_results'][pipeline_id]['validation_score'] + for pipeline_id in automl.results["search_order"]: + assert len(automl.results["pipeline_results"][pipeline_id]["cv_data"]) == 1 + assert ( + automl.results["pipeline_results"][pipeline_id]["cv_data"][0][ + "mean_cv_score" + ] + == 1.234 + ) + assert ( + automl.results["pipeline_results"][pipeline_id]["mean_cv_score"] + == automl.results["pipeline_results"][pipeline_id]["validation_score"] + ) def test_large_dataset_split_size(X_y_binary): X, y = X_y_binary def generate_fake_dataset(rows): - X = pd.DataFrame({'col_0': [i for i in range(rows)]}) + X = pd.DataFrame({"col_0": [i for i in range(rows)]}) y = pd.Series([i % 2 for i in range(rows)]) return X, y - fraud_objective = FraudCost(amount_col='col_0') - - automl = AutoMLSearch(X_train=X, y_train=y, - problem_type='binary', - objective=fraud_objective, - additional_objectives=['auc', 'f1', 'precision'], - max_time=1, - max_iterations=1, - optimize_thresholds=True) + fraud_objective = FraudCost(amount_col="col_0") + + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective=fraud_objective, + additional_objectives=["auc", "f1", "precision"], + max_time=1, + max_iterations=1, + optimize_thresholds=True, + ) assert isinstance(automl.data_splitter, StratifiedKFold) under_max_rows = _LARGE_DATA_ROW_THRESHOLD - 1 X, y = generate_fake_dataset(under_max_rows) - automl = AutoMLSearch(X_train=X, y_train=y, - problem_type='binary', - objective=fraud_objective, - additional_objectives=['auc', 'f1', 'precision'], - max_time=1, - max_iterations=1, - optimize_thresholds=True) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective=fraud_objective, + additional_objectives=["auc", "f1", "precision"], + max_time=1, + max_iterations=1, + optimize_thresholds=True, + ) assert isinstance(automl.data_splitter, StratifiedKFold) automl.data_splitter = None over_max_rows = _LARGE_DATA_ROW_THRESHOLD + 1 X, y = generate_fake_dataset(over_max_rows) - automl = AutoMLSearch(X_train=X, y_train=y, - problem_type='binary', - objective=fraud_objective, - additional_objectives=['auc', 'f1', 'precision'], - max_time=1, - max_iterations=1, - optimize_thresholds=True) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective=fraud_objective, + additional_objectives=["auc", "f1", "precision"], + max_time=1, + max_iterations=1, + optimize_thresholds=True, + ) assert 
isinstance(automl.data_splitter, TrainingValidationSplit) assert automl.data_splitter.test_size == (_LARGE_DATA_PERCENT_VALIDATION) @@ -620,77 +867,114 @@ def test_data_splitter_shuffle(): # thus yielding an R^2 well below 0. n = 100000 - X = pd.DataFrame({'col_0': np.random.random(n)}) - y = pd.Series(np.arange(n), name='target') - automl = AutoMLSearch(X_train=X, y_train=y, - problem_type='regression', - max_time=1, - max_iterations=1, - n_jobs=1) - automl.search() - assert automl.results['search_order'] == [0] - assert len(automl.results['pipeline_results'][0]['cv_data']) == 3 + X = pd.DataFrame({"col_0": np.random.random(n)}) + y = pd.Series(np.arange(n), name="target") + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + max_time=1, + max_iterations=1, + n_jobs=1, + ) + automl.search() + assert automl.results["search_order"] == [0] + assert len(automl.results["pipeline_results"][0]["cv_data"]) == 3 for fold in range(3): - np.testing.assert_almost_equal(automl.results['pipeline_results'][0]['cv_data'][fold]["mean_cv_score"], 0.0, decimal=4) - np.testing.assert_almost_equal(automl.results['pipeline_results'][0]["mean_cv_score"], 0.0, decimal=4) - np.testing.assert_almost_equal(automl.results['pipeline_results'][0]['validation_score'], 0.0, decimal=4) - - -def test_allowed_pipelines_with_incorrect_problem_type(dummy_binary_pipeline_class, X_y_binary): + np.testing.assert_almost_equal( + automl.results["pipeline_results"][0]["cv_data"][fold]["mean_cv_score"], + 0.0, + decimal=4, + ) + np.testing.assert_almost_equal( + automl.results["pipeline_results"][0]["mean_cv_score"], 0.0, decimal=4 + ) + np.testing.assert_almost_equal( + automl.results["pipeline_results"][0]["validation_score"], 0.0, decimal=4 + ) + + +def test_allowed_pipelines_with_incorrect_problem_type( + dummy_binary_pipeline_class, X_y_binary +): X, y = X_y_binary # checks that not setting allowed_pipelines does not error out - AutoMLSearch(X_train=X, y_train=y, problem_type='binary') + AutoMLSearch(X_train=X, y_train=y, problem_type="binary") with pytest.raises(ValueError, match="is not compatible with problem_type"): - AutoMLSearch(X_train=X, y_train=y, problem_type='regression', allowed_pipelines=[dummy_binary_pipeline_class({})]) + AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + allowed_pipelines=[dummy_binary_pipeline_class({})], + ) def test_main_objective_problem_type_mismatch(X_y_binary): X, y = X_y_binary with pytest.raises(ValueError, match="is not compatible with a"): - AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='R2') + AutoMLSearch(X_train=X, y_train=y, problem_type="binary", objective="R2") with pytest.raises(ValueError, match="is not compatible with a"): - AutoMLSearch(X_train=X, y_train=y, problem_type='regression', objective='MCC Binary') + AutoMLSearch( + X_train=X, y_train=y, problem_type="regression", objective="MCC Binary" + ) with pytest.raises(ValueError, match="is not compatible with a"): - AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='MCC Multiclass') + AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", objective="MCC Multiclass" + ) with pytest.raises(ValueError, match="is not compatible with a"): - AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', objective='MSE') + AutoMLSearch(X_train=X, y_train=y, problem_type="multiclass", objective="MSE") def test_init_missing_data(X_y_binary): X, y = X_y_binary - with pytest.raises(ValueError, match=r"Must specify training data as a 2d array 
using the X_train argument"): - AutoMLSearch(y_train=y, problem_type='binary') + with pytest.raises( + ValueError, + match=r"Must specify training data as a 2d array using the X_train argument", + ): + AutoMLSearch(y_train=y, problem_type="binary") - with pytest.raises(ValueError, match=r"Must specify training data target values as a 1d vector using the y_train argument"): - AutoMLSearch(X_train=X, problem_type='binary') + with pytest.raises( + ValueError, + match=r"Must specify training data target values as a 1d vector using the y_train argument", + ): + AutoMLSearch(X_train=X, problem_type="binary") def test_init_problem_type_error(X_y_binary): X, y = X_y_binary - with pytest.raises(ValueError, match=r"choose one of \(binary, multiclass, regression\) as problem_type"): + with pytest.raises( + ValueError, + match=r"choose one of \(binary, multiclass, regression\) as problem_type", + ): AutoMLSearch(X_train=X, y_train=y) with pytest.raises(KeyError, match=r"does not exist"): - AutoMLSearch(X_train=X, y_train=y, problem_type='multi') + AutoMLSearch(X_train=X, y_train=y, problem_type="multi") def test_init_objective(X_y_binary): X, y = X_y_binary - defaults = {'multiclass': 'Log Loss Multiclass', 'binary': 'Log Loss Binary', 'regression': 'R2'} + defaults = { + "multiclass": "Log Loss Multiclass", + "binary": "Log Loss Binary", + "regression": "R2", + } for problem_type in defaults: error_automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type) assert error_automl.objective.name == defaults[problem_type] -@patch('evalml.automl.automl_search.AutoMLSearch.search') +@patch("evalml.automl.automl_search.AutoMLSearch.search") def test_checks_at_search_time(mock_search, dummy_regression_pipeline_class, X_y_multi): X, y = X_y_multi error_text = "in search, problem_type mismatches label type." 
mock_search.side_effect = ValueError(error_text) - error_automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', objective="R2") + error_automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="regression", objective="R2" + ) with pytest.raises(ValueError, match=error_text): error_automl.search() @@ -698,14 +982,21 @@ def test_checks_at_search_time(mock_search, dummy_regression_pipeline_class, X_y def test_incompatible_additional_objectives(X_y_binary): X, y = X_y_binary with pytest.raises(ValueError, match="is not compatible with a "): - AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', additional_objectives=['Precision', 'AUC']) + AutoMLSearch( + X_train=X, + y_train=y, + problem_type="multiclass", + additional_objectives=["Precision", "AUC"], + ) def test_default_objective(X_y_binary): X, y = X_y_binary - correct_matches = {ProblemTypes.MULTICLASS: 'Log Loss Multiclass', - ProblemTypes.BINARY: 'Log Loss Binary', - ProblemTypes.REGRESSION: 'R2'} + correct_matches = { + ProblemTypes.MULTICLASS: "Log Loss Multiclass", + ProblemTypes.BINARY: "Log Loss Binary", + ProblemTypes.REGRESSION: "R2", + } for problem_type in correct_matches: automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type) assert automl.objective.name == correct_matches[problem_type] @@ -714,21 +1005,26 @@ def test_default_objective(X_y_binary): assert automl.objective.name == correct_matches[problem_type] -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_add_to_rankings(mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary): X, y = X_y_binary - mock_score.return_value = {'Log Loss Binary': 1.0} - - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=1, - allowed_pipelines=[dummy_binary_pipeline_class({})]) + mock_score.return_value = {"Log Loss Binary": 1.0} + + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=1, + allowed_pipelines=[dummy_binary_pipeline_class({})], + ) automl.search() assert len(automl.rankings) == 1 assert len(automl.full_rankings) == 1 original_best_pipeline = automl.best_pipeline assert original_best_pipeline is not None - mock_score.return_value = {'Log Loss Binary': 0.1234} + mock_score.return_value = {"Log Loss Binary": 0.1234} test_pipeline = dummy_binary_pipeline_class(parameters={}) automl.add_to_rankings(test_pipeline) assert automl.best_pipeline.name == test_pipeline.name @@ -738,8 +1034,10 @@ def test_add_to_rankings(mock_fit, mock_score, dummy_binary_pipeline_class, X_y_ assert len(automl.full_rankings) == 2 assert 0.1234 in automl.rankings["mean_cv_score"].values - mock_score.return_value = {'Log Loss Binary': 0.5678} - test_pipeline_2 = dummy_binary_pipeline_class(parameters={'Mock Classifier': {'a': 1.234}}) + mock_score.return_value = {"Log Loss Binary": 0.5678} + test_pipeline_2 = dummy_binary_pipeline_class( + parameters={"Mock Classifier": {"a": 1.234}} + ) automl.add_to_rankings(test_pipeline_2) assert automl.best_pipeline.name == test_pipeline.name assert automl.best_pipeline.parameters == test_pipeline.parameters @@ -750,14 +1048,21 @@ def test_add_to_rankings(mock_fit, mock_score, dummy_binary_pipeline_class, X_y_ assert 0.5678 in automl.full_rankings["mean_cv_score"].values -@patch('evalml.pipelines.BinaryClassificationPipeline.score') 
-@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_add_to_rankings_no_search(mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary): +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_add_to_rankings_no_search( + mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary +): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=1, - allowed_pipelines=[dummy_binary_pipeline_class({})]) - - mock_score.return_value = {'Log Loss Binary': 0.5234} + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=1, + allowed_pipelines=[dummy_binary_pipeline_class({})], + ) + + mock_score.return_value = {"Log Loss Binary": 0.5234} test_pipeline = dummy_binary_pipeline_class(parameters={}) automl.add_to_rankings(test_pipeline) @@ -766,17 +1071,31 @@ def test_add_to_rankings_no_search(mock_fit, mock_score, dummy_binary_pipeline_c assert isinstance(automl.data_splitter, StratifiedKFold) assert len(automl.rankings) == 1 assert 0.5234 in automl.rankings["mean_cv_score"].values - assert np.isnan(automl.results['pipeline_results'][0]['percent_better_than_baseline']) - assert all(np.isnan(res) for res in automl.results['pipeline_results'][0]['percent_better_than_baseline_all_objectives'].values()) - - -@patch('evalml.pipelines.RegressionPipeline.score') + assert np.isnan( + automl.results["pipeline_results"][0]["percent_better_than_baseline"] + ) + assert all( + np.isnan(res) + for res in automl.results["pipeline_results"][0][ + "percent_better_than_baseline_all_objectives" + ].values() + ) + + +@patch("evalml.pipelines.RegressionPipeline.score") def test_add_to_rankings_regression_large(mock_score, dummy_regression_pipeline_class): - X = pd.DataFrame({'col_0': [i for i in range(101000)]}) + X = pd.DataFrame({"col_0": [i for i in range(101000)]}) y = pd.Series([i for i in range(101000)]) - automl = AutoMLSearch(X_train=X, y_train=y, allowed_pipelines=[dummy_regression_pipeline_class({})], - problem_type='regression', max_time=1, max_iterations=1, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + allowed_pipelines=[dummy_regression_pipeline_class({})], + problem_type="regression", + max_time=1, + max_iterations=1, + n_jobs=1, + ) assert isinstance(automl.data_splitter, TrainingValidationSplit) mock_score.return_value = {automl.objective.name: 0.1234} @@ -787,20 +1106,36 @@ def test_add_to_rankings_regression_large(mock_score, dummy_regression_pipeline_ def test_add_to_rankings_new_pipeline(dummy_regression_pipeline_class): - X = pd.DataFrame({'col_0': [i for i in range(100)]}) + X = pd.DataFrame({"col_0": [i for i in range(100)]}) y = pd.Series([i for i in range(100)]) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', max_time=1, max_iterations=1, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + max_time=1, + max_iterations=1, + n_jobs=1, + ) test_pipeline = dummy_regression_pipeline_class(parameters={}) automl.add_to_rankings(test_pipeline) -@patch('evalml.pipelines.RegressionPipeline.score') -def test_add_to_rankings_regression(mock_score, dummy_regression_pipeline_class, X_y_regression): +@patch("evalml.pipelines.RegressionPipeline.score") +def test_add_to_rankings_regression( + mock_score, dummy_regression_pipeline_class, X_y_regression +): X, y = X_y_regression - automl = AutoMLSearch(X_train=X, y_train=y, 
allowed_pipelines=[dummy_regression_pipeline_class({})], - problem_type='regression', max_time=1, max_iterations=1, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + allowed_pipelines=[dummy_regression_pipeline_class({})], + problem_type="regression", + max_time=1, + max_iterations=1, + n_jobs=1, + ) mock_score.return_value = {automl.objective.name: 0.1234} automl.add_to_rankings(dummy_regression_pipeline_class({})) @@ -809,13 +1144,21 @@ def test_add_to_rankings_regression(mock_score, dummy_regression_pipeline_class, assert 0.1234 in automl.rankings["mean_cv_score"].values -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_add_to_rankings_duplicate(mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary): +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_add_to_rankings_duplicate( + mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary +): X, y = X_y_binary - mock_score.return_value = {'Log Loss Binary': 0.1234} - - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=1, allowed_pipelines=[dummy_binary_pipeline_class({})]) + mock_score.return_value = {"Log Loss Binary": 0.1234} + + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=1, + allowed_pipelines=[dummy_binary_pipeline_class({})], + ) automl.search() best_pipeline = automl.best_pipeline test_pipeline = dummy_binary_pipeline_class(parameters={}) @@ -826,22 +1169,32 @@ def test_add_to_rankings_duplicate(mock_fit, mock_score, dummy_binary_pipeline_c assert automl.add_to_rankings(test_pipeline_duplicate) is None -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_add_to_rankings_trained(mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary): +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_add_to_rankings_trained( + mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary +): X, y = X_y_binary - mock_score.return_value = {'Log Loss Binary': 1.0} + mock_score.return_value = {"Log Loss Binary": 1.0} class CoolBinaryClassificationPipeline(dummy_binary_pipeline_class): custom_name = "Cool Binary Classification Pipeline" - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=1, - allowed_pipelines=[dummy_binary_pipeline_class({}), CoolBinaryClassificationPipeline({})]) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=1, + allowed_pipelines=[ + dummy_binary_pipeline_class({}), + CoolBinaryClassificationPipeline({}), + ], + ) automl.search() assert len(automl.rankings) == 1 assert len(automl.full_rankings) == 1 - mock_score.return_value = {'Log Loss Binary': 0.1234} + mock_score.return_value = {"Log Loss Binary": 0.1234} test_pipeline = dummy_binary_pipeline_class(parameters={}) automl.add_to_rankings(test_pipeline) assert len(automl.rankings) == 2 @@ -860,12 +1213,20 @@ class CoolBinaryClassificationPipeline(dummy_binary_pipeline_class): def test_no_search(X_y_binary): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary') + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary") assert isinstance(automl.rankings, pd.DataFrame) assert isinstance(automl.full_rankings, pd.DataFrame) - 
df_columns = ["id", "pipeline_name", "mean_cv_score", "standard_deviation_cv_score", - "validation_score", "percent_better_than_baseline", "high_variance_cv", "parameters"] + df_columns = [ + "id", + "pipeline_name", + "mean_cv_score", + "standard_deviation_cv_score", + "validation_score", + "percent_better_than_baseline", + "high_variance_cv", + "parameters", + ] assert (automl.rankings.columns == df_columns).all() assert (automl.full_rankings.columns == df_columns).all() @@ -879,40 +1240,48 @@ def test_no_search(X_y_binary): automl.describe_pipeline(0) -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_get_pipeline_invalid(mock_fit, mock_score, X_y_binary): X, y = X_y_binary - mock_score.return_value = {'Log Loss Binary': 1.0} + mock_score.return_value = {"Log Loss Binary": 1.0} - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary') - with pytest.raises(PipelineNotFoundError, match="Pipeline not found in automl results"): + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary") + with pytest.raises( + PipelineNotFoundError, match="Pipeline not found in automl results" + ): automl.get_pipeline(1000) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=1) + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=1) automl.search() - assert automl.get_pipeline(0).name == 'Mode Baseline Binary Classification Pipeline' - automl._results['pipeline_results'][0].pop('pipeline_class') + assert automl.get_pipeline(0).name == "Mode Baseline Binary Classification Pipeline" + automl._results["pipeline_results"][0].pop("pipeline_class") automl._pipelines_searched.pop(0) - with pytest.raises(PipelineNotFoundError, match="Pipeline class or parameters not found in automl results"): + with pytest.raises( + PipelineNotFoundError, + match="Pipeline class or parameters not found in automl results", + ): automl.get_pipeline(0) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=1) + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=1) automl.search() - assert automl.get_pipeline(0).name == 'Mode Baseline Binary Classification Pipeline' - automl._results['pipeline_results'][0].pop('parameters') - with pytest.raises(PipelineNotFoundError, match="Pipeline class or parameters not found in automl results"): + assert automl.get_pipeline(0).name == "Mode Baseline Binary Classification Pipeline" + automl._results["pipeline_results"][0].pop("parameters") + with pytest.raises( + PipelineNotFoundError, + match="Pipeline class or parameters not found in automl results", + ): automl.get_pipeline(0) -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_get_pipeline(mock_fit, mock_score, X_y_binary): X, y = X_y_binary - mock_score.return_value = {'Log Loss Binary': 1.0} + mock_score.return_value = {"Log Loss Binary": 1.0} - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=1) + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=1) automl.search() for _, ranking in automl.rankings.iterrows(): pl = 
automl.get_pipeline(ranking.id) @@ -921,18 +1290,21 @@ def test_get_pipeline(mock_fit, mock_score, X_y_binary): assert not pl._is_fitted -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={'Log Loss Binary': 1.0}) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 1.0}, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") @pytest.mark.parametrize("return_dict", [True, False]) def test_describe_pipeline(mock_fit, mock_score, return_dict, caplog, X_y_binary): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=1) + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=1) automl.search() out = caplog.text assert "Searching up to 1 pipelines. " in out - assert len(automl.results['pipeline_results']) == 1 + assert len(automl.results["pipeline_results"]) == 1 caplog.clear() automl_dict = automl.describe_pipeline(0, return_dict=return_dict) out = caplog.text @@ -950,39 +1322,83 @@ def test_describe_pipeline(mock_fit, mock_score, return_dict, caplog, X_y_binary assert "coef of var 0.000 - -" in out if return_dict: - assert automl_dict['id'] == 0 - assert automl_dict['pipeline_name'] == 'Mode Baseline Binary Classification Pipeline' - assert automl_dict['pipeline_summary'] == 'Baseline Classifier' - assert automl_dict['parameters'] == {'Baseline Classifier': {'strategy': 'mode'}} + assert automl_dict["id"] == 0 + assert ( + automl_dict["pipeline_name"] + == "Mode Baseline Binary Classification Pipeline" + ) + assert automl_dict["pipeline_summary"] == "Baseline Classifier" + assert automl_dict["parameters"] == { + "Baseline Classifier": {"strategy": "mode"} + } assert automl_dict["mean_cv_score"] == 1.0 - assert not automl_dict['high_variance_cv'] - assert isinstance(automl_dict['training_time'], float) - assert automl_dict['cv_data'] == [{'all_objective_scores': OrderedDict([('Log Loss Binary', 1.0), ('# Training', 66), ('# Validation', 34)]), "mean_cv_score": 1.0, 'binary_classification_threshold': None}, - {'all_objective_scores': OrderedDict([('Log Loss Binary', 1.0), ('# Training', 67), ('# Validation', 33)]), "mean_cv_score": 1.0, 'binary_classification_threshold': None}, - {'all_objective_scores': OrderedDict([('Log Loss Binary', 1.0), ('# Training', 67), ('# Validation', 33)]), "mean_cv_score": 1.0, 'binary_classification_threshold': None}] - assert automl_dict['percent_better_than_baseline_all_objectives'] == {'Log Loss Binary': 0} - assert automl_dict['percent_better_than_baseline'] == 0 - assert automl_dict['validation_score'] == 1.0 + assert not automl_dict["high_variance_cv"] + assert isinstance(automl_dict["training_time"], float) + assert automl_dict["cv_data"] == [ + { + "all_objective_scores": OrderedDict( + [("Log Loss Binary", 1.0), ("# Training", 66), ("# Validation", 34)] + ), + "mean_cv_score": 1.0, + "binary_classification_threshold": None, + }, + { + "all_objective_scores": OrderedDict( + [("Log Loss Binary", 1.0), ("# Training", 67), ("# Validation", 33)] + ), + "mean_cv_score": 1.0, + "binary_classification_threshold": None, + }, + { + "all_objective_scores": OrderedDict( + [("Log Loss Binary", 1.0), ("# Training", 67), ("# Validation", 33)] + ), + "mean_cv_score": 1.0, + "binary_classification_threshold": None, + }, + ] + assert automl_dict["percent_better_than_baseline_all_objectives"] == { + "Log Loss Binary": 0 + } + assert 
automl_dict["percent_better_than_baseline"] == 0 + assert automl_dict["validation_score"] == 1.0 else: assert automl_dict is None -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") @pytest.mark.parametrize("return_dict", [True, False]) -def test_describe_pipeline_with_ensembling(mock_pipeline_fit, mock_score, return_dict, X_y_binary, caplog): +def test_describe_pipeline_with_ensembling( + mock_pipeline_fit, mock_score, return_dict, X_y_binary, caplog +): X, y = X_y_binary two_stacking_batches = 1 + 2 * (len(get_estimators(ProblemTypes.BINARY)) + 1) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_batches=two_stacking_batches, - objective="Log Loss Binary", ensembling=True, error_callback=raise_error_callback) - - mock_score.side_effect = [{'Log Loss Binary': score} for score in np.arange(0, -1 * automl.max_iterations * automl.data_splitter.get_n_splits(), -0.1)] # Dcreases with each call + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_batches=two_stacking_batches, + objective="Log Loss Binary", + ensembling=True, + error_callback=raise_error_callback, + ) + + mock_score.side_effect = [ + {"Log Loss Binary": score} + for score in np.arange( + 0, -1 * automl.max_iterations * automl.data_splitter.get_n_splits(), -0.1 + ) + ] # Dcreases with each call automl.search() - pipeline_names = automl.rankings['pipeline_name'] - assert pipeline_names.str.contains('Ensemble').any() + pipeline_names = automl.rankings["pipeline_name"] + assert pipeline_names.str.contains("Ensemble").any() - ensemble_ids = [_get_first_stacked_classifier_no() - 1, len(automl.results['pipeline_results']) - 1] + ensemble_ids = [ + _get_first_stacked_classifier_no() - 1, + len(automl.results["pipeline_results"]) - 1, + ] for i, ensemble_id in enumerate(ensemble_ids): caplog.clear() @@ -997,52 +1413,84 @@ def test_describe_pipeline_with_ensembling(mock_pipeline_fit, mock_score, return assert "Input for ensembler are pipelines with IDs:" in out if return_dict: - assert automl_dict['id'] == ensemble_id - assert automl_dict['pipeline_name'] == "Stacked Ensemble Classification Pipeline" - assert automl_dict['pipeline_summary'] == 'Stacked Ensemble Classifier' + assert automl_dict["id"] == ensemble_id + assert ( + automl_dict["pipeline_name"] + == "Stacked Ensemble Classification Pipeline" + ) + assert automl_dict["pipeline_summary"] == "Stacked Ensemble Classifier" assert isinstance(automl_dict["mean_cv_score"], float) - assert not automl_dict['high_variance_cv'] - assert isinstance(automl_dict['training_time'], float) - assert isinstance(automl_dict['percent_better_than_baseline_all_objectives'], dict) - assert isinstance(automl_dict['percent_better_than_baseline'], float) - assert isinstance(automl_dict['validation_score'], float) - assert len(automl_dict['input_pipeline_ids']) == len(allowed_model_families("binary")) + assert not automl_dict["high_variance_cv"] + assert isinstance(automl_dict["training_time"], float) + assert isinstance( + automl_dict["percent_better_than_baseline_all_objectives"], dict + ) + assert isinstance(automl_dict["percent_better_than_baseline"], float) + assert isinstance(automl_dict["validation_score"], float) + assert len(automl_dict["input_pipeline_ids"]) == len( + allowed_model_families("binary") + ) if i == 0: - assert all(input_id < ensemble_id 
for input_id in automl_dict['input_pipeline_ids']) + assert all( + input_id < ensemble_id + for input_id in automl_dict["input_pipeline_ids"] + ) else: - assert all(input_id < ensemble_id for input_id in automl_dict['input_pipeline_ids']) - assert all(input_id > ensemble_ids[0] for input_id in automl_dict['input_pipeline_ids']) + assert all( + input_id < ensemble_id + for input_id in automl_dict["input_pipeline_ids"] + ) + assert all( + input_id > ensemble_ids[0] + for input_id in automl_dict["input_pipeline_ids"] + ) else: assert automl_dict is None -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_results_getter(mock_fit, mock_score, X_y_binary): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=1) + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=1) - assert automl.results == {'pipeline_results': {}, - 'search_order': []} + assert automl.results == {"pipeline_results": {}, "search_order": []} - mock_score.return_value = {'Log Loss Binary': 1.0} + mock_score.return_value = {"Log Loss Binary": 1.0} automl.search() - assert automl.results['pipeline_results'][0]["mean_cv_score"] == 1.0 + assert automl.results["pipeline_results"][0]["mean_cv_score"] == 1.0 - with pytest.raises(AttributeError, match='set attribute'): + with pytest.raises(AttributeError, match="set attribute"): automl.results = 2.0 - automl.results['pipeline_results'][0]["mean_cv_score"] = 2.0 - assert automl.results['pipeline_results'][0]["mean_cv_score"] == 1.0 + automl.results["pipeline_results"][0]["mean_cv_score"] = 2.0 + assert automl.results["pipeline_results"][0]["mean_cv_score"] == 1.0 -@pytest.mark.parametrize("data_type", ['li', 'np', 'pd', 'ww']) +@pytest.mark.parametrize("data_type", ["li", "np", "pd", "ww"]) @pytest.mark.parametrize("automl_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]) -@pytest.mark.parametrize("target_type", ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool', 'category', 'object']) -def test_targets_pandas_data_types_classification(data_type, automl_type, target_type, make_data_type): - if data_type == 'np' and target_type in ['Int64', 'boolean']: - pytest.skip("Skipping test where data type is numpy and target type is nullable dtype") +@pytest.mark.parametrize( + "target_type", + [ + "int16", + "int32", + "int64", + "float16", + "float32", + "float64", + "bool", + "category", + "object", + ], +) +def test_targets_pandas_data_types_classification( + data_type, automl_type, target_type, make_data_type +): + if data_type == "np" and target_type in ["Int64", "boolean"]: + pytest.skip( + "Skipping test where data type is numpy and target type is nullable dtype" + ) if automl_type == ProblemTypes.BINARY: X, y = load_breast_cancer() @@ -1050,11 +1498,13 @@ def test_targets_pandas_data_types_classification(data_type, automl_type, target y = y.map({"malignant": False, "benign": True}) elif automl_type == ProblemTypes.MULTICLASS: if "bool" in target_type: - pytest.skip("Skipping test where problem type is multiclass but target type is boolean") + pytest.skip( + "Skipping test where problem type is multiclass but target type is boolean" + ) X, y = load_wine() unique_vals = y.unique() # Update target types as necessary - if target_type in ['category', 'object']: + if target_type in 
["category", "object"]: if target_type == "category": y = pd.Series(pd.Categorical(y)) elif "int" in target_type.lower(): @@ -1063,14 +1513,16 @@ def test_targets_pandas_data_types_classification(data_type, automl_type, target y = y.map({unique_vals[i]: float(i) for i in range(len(unique_vals))}) y = y.astype(target_type) - if data_type != 'pd': + if data_type != "pd": X = make_data_type(data_type, X) y = make_data_type(data_type, y) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=automl_type, max_iterations=3, n_jobs=1) + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type=automl_type, max_iterations=3, n_jobs=1 + ) automl.search() - for pipeline_id, pipeline_result in automl.results['pipeline_results'].items(): - cv_data = pipeline_result['cv_data'] + for pipeline_id, pipeline_result in automl.results["pipeline_results"].items(): + cv_data = pipeline_result["cv_data"] for fold in cv_data: all_objective_scores = fold["all_objective_scores"] for score in all_objective_scores.values(): @@ -1106,75 +1558,115 @@ def __call__(self): dont_interrupt_after_bad_message = ["Yes", "yes.", "n"] -@pytest.mark.parametrize("when_to_interrupt,user_input,number_results", - [(1, interrupt, 0), - (1, interrupt_after_bad_message, 0)]) +@pytest.mark.parametrize( + "when_to_interrupt,user_input,number_results", + [(1, interrupt, 0), (1, interrupt_after_bad_message, 0)], +) @patch("builtins.input") -@patch('evalml.automl.engine.sequential_engine.SequentialComputation.get_result') -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"F1": 1.0}) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_catch_keyboard_interrupt_baseline(mock_fit, mock_score, mock_future_get_result, mock_input, - when_to_interrupt, user_input, number_results, - X_y_binary): +@patch("evalml.automl.engine.sequential_engine.SequentialComputation.get_result") +@patch("evalml.pipelines.BinaryClassificationPipeline.score", return_value={"F1": 1.0}) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_catch_keyboard_interrupt_baseline( + mock_fit, + mock_score, + mock_future_get_result, + mock_input, + when_to_interrupt, + user_input, + number_results, + X_y_binary, +): X, y = X_y_binary mock_input.side_effect = user_input - mock_future_get_result.side_effect = KeyboardInterruptOnKthPipeline(k=when_to_interrupt, starting_index=1) + mock_future_get_result.side_effect = KeyboardInterruptOnKthPipeline( + k=when_to_interrupt, starting_index=1 + ) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=5, - objective="f1") + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", max_iterations=5, objective="f1" + ) automl.search() - assert len(automl._results['pipeline_results']) == number_results + assert len(automl._results["pipeline_results"]) == number_results if number_results == 0: with pytest.raises(PipelineNotFoundError): _ = automl.best_pipeline -@pytest.mark.parametrize("when_to_interrupt,user_input,number_results", - [(1, dont_interrupt, 5), - (1, dont_interrupt_after_bad_message, 5), - (2, interrupt, 1), - (2, interrupt_after_bad_message, 1), - (2, dont_interrupt, 5), - (2, dont_interrupt_after_bad_message, 5), - (3, interrupt, 2), - (3, interrupt_after_bad_message, 2), - (3, dont_interrupt, 5), - (3, dont_interrupt_after_bad_message, 5), - (5, interrupt, 4), - (5, interrupt_after_bad_message, 4), - (5, dont_interrupt, 5), - (5, dont_interrupt_after_bad_message, 5)]) +@pytest.mark.parametrize( + 
"when_to_interrupt,user_input,number_results", + [ + (1, dont_interrupt, 5), + (1, dont_interrupt_after_bad_message, 5), + (2, interrupt, 1), + (2, interrupt_after_bad_message, 1), + (2, dont_interrupt, 5), + (2, dont_interrupt_after_bad_message, 5), + (3, interrupt, 2), + (3, interrupt_after_bad_message, 2), + (3, dont_interrupt, 5), + (3, dont_interrupt_after_bad_message, 5), + (5, interrupt, 4), + (5, interrupt_after_bad_message, 4), + (5, dont_interrupt, 5), + (5, dont_interrupt_after_bad_message, 5), + ], +) @patch("builtins.input") -@patch('evalml.automl.engine.sequential_engine.SequentialComputation.done') -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"F1": 1.0}) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_catch_keyboard_interrupt(mock_fit, mock_score, mock_future_get_result, mock_input, - when_to_interrupt, user_input, number_results, - X_y_binary): +@patch("evalml.automl.engine.sequential_engine.SequentialComputation.done") +@patch("evalml.pipelines.BinaryClassificationPipeline.score", return_value={"F1": 1.0}) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_catch_keyboard_interrupt( + mock_fit, + mock_score, + mock_future_get_result, + mock_input, + when_to_interrupt, + user_input, + number_results, + X_y_binary, +): X, y = X_y_binary mock_input.side_effect = user_input - mock_future_get_result.side_effect = KeyboardInterruptOnKthPipeline(k=when_to_interrupt, starting_index=2) - - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=5, - objective="f1", optimize_thresholds=False) + mock_future_get_result.side_effect = KeyboardInterruptOnKthPipeline( + k=when_to_interrupt, starting_index=2 + ) + + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=5, + objective="f1", + optimize_thresholds=False, + ) automl.search() - assert len(automl._results['pipeline_results']) == number_results + assert len(automl._results["pipeline_results"]) == number_results @patch("builtins.input", return_value="Y") -@patch('evalml.automl.engine.sequential_engine.SequentialComputation.done', - side_effect=KeyboardInterruptOnKthPipeline(k=4, starting_index=2)) -@patch('evalml.automl.engine.sequential_engine.SequentialComputation.cancel') -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"F1": 1.0}) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_jobs_cancelled_when_keyboard_interrupt(mock_fit, mock_score, mock_cancel, mock_done, mock_input, X_y_binary): +@patch( + "evalml.automl.engine.sequential_engine.SequentialComputation.done", + side_effect=KeyboardInterruptOnKthPipeline(k=4, starting_index=2), +) +@patch("evalml.automl.engine.sequential_engine.SequentialComputation.cancel") +@patch("evalml.pipelines.BinaryClassificationPipeline.score", return_value={"F1": 1.0}) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_jobs_cancelled_when_keyboard_interrupt( + mock_fit, mock_score, mock_cancel, mock_done, mock_input, X_y_binary +): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=6, - objective="f1", optimize_thresholds=False) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=6, + objective="f1", + optimize_thresholds=False, + ) automl.search() - assert len(automl._results['pipeline_results']) == 3 + assert len(automl._results["pipeline_results"]) == 3 # Since we trigger KeyBoardInterrupt the 4th 
time we call done, we've successfully evaluated the baseline plus 2 # pipelines in the first batch. Since there are len(automl.allowed_pipelines) pipelines in the first batch, @@ -1183,65 +1675,140 @@ def test_jobs_cancelled_when_keyboard_interrupt(mock_fit, mock_score, mock_cance def make_mock_rankings(scores): - df = pd.DataFrame({'id': range(len(scores)), "mean_cv_score": scores, - 'pipeline_name': [f'Mock name {i}' for i in range(len(scores))]}) + df = pd.DataFrame( + { + "id": range(len(scores)), + "mean_cv_score": scores, + "pipeline_name": [f"Mock name {i}" for i in range(len(scores))], + } + ) return df -@patch('evalml.automl.automl_algorithm.IterativeAlgorithm.next_batch') -@patch('evalml.automl.AutoMLSearch.full_rankings', new_callable=PropertyMock) -@patch('evalml.automl.AutoMLSearch.rankings', new_callable=PropertyMock) -def test_pipelines_in_batch_return_nan(mock_rankings, mock_full_rankings, mock_next_batch, X_y_binary, dummy_binary_pipeline_class): - X, y = X_y_binary - mock_rankings.side_effect = [make_mock_rankings([0, 0, 0]), # first batch - make_mock_rankings([0, 0, 0, 0, np.nan]), # second batch - make_mock_rankings([0, 0, 0, 0, np.nan, np.nan, np.nan])] # third batch, should raise error - mock_full_rankings.side_effect = [make_mock_rankings([0, 0, 0]), # first batch - make_mock_rankings([0, 0, 0, 0, np.nan]), # second batch - make_mock_rankings([0, 0, 0, 0, np.nan, np.nan, np.nan])] # third batch, should raise error - mock_next_batch.side_effect = [[dummy_binary_pipeline_class(parameters={}), dummy_binary_pipeline_class(parameters={})] for i in range(3)] - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_batches=3, allowed_pipelines=[dummy_binary_pipeline_class({})], n_jobs=1) - with pytest.raises(AutoMLSearchException, match="All pipelines in the current AutoML batch produced a score of np.nan on the primary objective"): +@patch("evalml.automl.automl_algorithm.IterativeAlgorithm.next_batch") +@patch("evalml.automl.AutoMLSearch.full_rankings", new_callable=PropertyMock) +@patch("evalml.automl.AutoMLSearch.rankings", new_callable=PropertyMock) +def test_pipelines_in_batch_return_nan( + mock_rankings, + mock_full_rankings, + mock_next_batch, + X_y_binary, + dummy_binary_pipeline_class, +): + X, y = X_y_binary + mock_rankings.side_effect = [ + make_mock_rankings([0, 0, 0]), # first batch + make_mock_rankings([0, 0, 0, 0, np.nan]), # second batch + make_mock_rankings([0, 0, 0, 0, np.nan, np.nan, np.nan]), + ] # third batch, should raise error + mock_full_rankings.side_effect = [ + make_mock_rankings([0, 0, 0]), # first batch + make_mock_rankings([0, 0, 0, 0, np.nan]), # second batch + make_mock_rankings([0, 0, 0, 0, np.nan, np.nan, np.nan]), + ] # third batch, should raise error + mock_next_batch.side_effect = [ + [ + dummy_binary_pipeline_class(parameters={}), + dummy_binary_pipeline_class(parameters={}), + ] + for i in range(3) + ] + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_batches=3, + allowed_pipelines=[dummy_binary_pipeline_class({})], + n_jobs=1, + ) + with pytest.raises( + AutoMLSearchException, + match="All pipelines in the current AutoML batch produced a score of np.nan on the primary objective", + ): automl.search() -@patch('evalml.automl.automl_algorithm.IterativeAlgorithm.next_batch') -@patch('evalml.automl.AutoMLSearch.full_rankings', new_callable=PropertyMock) -@patch('evalml.automl.AutoMLSearch.rankings', new_callable=PropertyMock) -def test_pipelines_in_batch_return_none(mock_rankings, 
mock_full_rankings, mock_next_batch, X_y_binary, dummy_binary_pipeline_class): - X, y = X_y_binary - mock_rankings.side_effect = [make_mock_rankings([0, 0, 0]), # first batch - make_mock_rankings([0, 0, 0, 0, None]), # second batch - make_mock_rankings([0, 0, 0, 0, None, None, None])] # third batch, should raise error - mock_full_rankings.side_effect = [make_mock_rankings([0, 0, 0]), # first batch - make_mock_rankings([0, 0, 0, 0, None]), # second batch - make_mock_rankings([0, 0, 0, 0, None, None, None])] # third batch, should raise error - mock_next_batch.side_effect = [[dummy_binary_pipeline_class(parameters={}), dummy_binary_pipeline_class(parameters={})] for i in range(3)] - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_batches=3, allowed_pipelines=[dummy_binary_pipeline_class({})], n_jobs=1) - with pytest.raises(AutoMLSearchException, match="All pipelines in the current AutoML batch produced a score of np.nan on the primary objective"): +@patch("evalml.automl.automl_algorithm.IterativeAlgorithm.next_batch") +@patch("evalml.automl.AutoMLSearch.full_rankings", new_callable=PropertyMock) +@patch("evalml.automl.AutoMLSearch.rankings", new_callable=PropertyMock) +def test_pipelines_in_batch_return_none( + mock_rankings, + mock_full_rankings, + mock_next_batch, + X_y_binary, + dummy_binary_pipeline_class, +): + X, y = X_y_binary + mock_rankings.side_effect = [ + make_mock_rankings([0, 0, 0]), # first batch + make_mock_rankings([0, 0, 0, 0, None]), # second batch + make_mock_rankings([0, 0, 0, 0, None, None, None]), + ] # third batch, should raise error + mock_full_rankings.side_effect = [ + make_mock_rankings([0, 0, 0]), # first batch + make_mock_rankings([0, 0, 0, 0, None]), # second batch + make_mock_rankings([0, 0, 0, 0, None, None, None]), + ] # third batch, should raise error + mock_next_batch.side_effect = [ + [ + dummy_binary_pipeline_class(parameters={}), + dummy_binary_pipeline_class(parameters={}), + ] + for i in range(3) + ] + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_batches=3, + allowed_pipelines=[dummy_binary_pipeline_class({})], + n_jobs=1, + ) + with pytest.raises( + AutoMLSearchException, + match="All pipelines in the current AutoML batch produced a score of np.nan on the primary objective", + ): automl.search() -@patch('evalml.automl.engine.engine_base.split_data') -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_error_during_train_test_split(mock_fit, mock_score, mock_split_data, X_y_binary): +@patch("evalml.automl.engine.engine_base.split_data") +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_error_during_train_test_split( + mock_fit, mock_score, mock_split_data, X_y_binary +): X, y = X_y_binary - mock_score.return_value = {'Log Loss Binary': 1.0} + mock_score.return_value = {"Log Loss Binary": 1.0} # this method is called during pipeline eval for binary classification and will cause scores to be set to nan mock_split_data.side_effect = RuntimeError() - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='Accuracy Binary', max_iterations=2, optimize_thresholds=True, train_best_pipeline=False) - with pytest.raises(AutoMLSearchException, match="All pipelines in the current AutoML batch produced a score of np.nan on the primary objective"): + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + 
objective="Accuracy Binary", + max_iterations=2, + optimize_thresholds=True, + train_best_pipeline=False, + ) + with pytest.raises( + AutoMLSearchException, + match="All pipelines in the current AutoML batch produced a score of np.nan on the primary objective", + ): automl.search() - for pipeline in automl.results['pipeline_results'].values(): + for pipeline in automl.results["pipeline_results"].values(): assert np.isnan(pipeline["mean_cv_score"]) -all_objectives = get_core_objectives("binary") + get_core_objectives("multiclass") + get_core_objectives("regression") +all_objectives = ( + get_core_objectives("binary") + + get_core_objectives("multiclass") + + get_core_objectives("regression") +) class CustomClassificationObjective(BinaryClassificationObjective): """Accuracy score for binary and multiclass classification.""" + name = "Classification Accuracy" greater_is_better = True score_needs_proba = False @@ -1255,6 +1822,7 @@ def objective_function(self, y_true, y_predicted, X=None): class CustomRegressionObjective(RegressionObjective): """Accuracy score for binary and multiclass classification.""" + name = "Custom Regression Objective" greater_is_better = True score_needs_proba = False @@ -1266,31 +1834,49 @@ def objective_function(self, y_true, y_predicted, X=None): """Not implementing since mocked in our tests.""" -@pytest.mark.parametrize("objective,pipeline_scores,baseline_score,problem_type_value", - product(all_objectives + [CostBenefitMatrix, CustomClassificationObjective()], - [(0.3, 0.4), (np.nan, 0.4), (0.3, np.nan), (np.nan, np.nan)], - [0.1, np.nan], - [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION])) -def test_percent_better_than_baseline_in_rankings(objective, pipeline_scores, baseline_score, problem_type_value, - dummy_binary_pipeline_class, dummy_multiclass_pipeline_class, - dummy_regression_pipeline_class, - dummy_time_series_regression_pipeline_class, - X_y_binary): +@pytest.mark.parametrize( + "objective,pipeline_scores,baseline_score,problem_type_value", + product( + all_objectives + [CostBenefitMatrix, CustomClassificationObjective()], + [(0.3, 0.4), (np.nan, 0.4), (0.3, np.nan), (np.nan, np.nan)], + [0.1, np.nan], + [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ], + ), +) +def test_percent_better_than_baseline_in_rankings( + objective, + pipeline_scores, + baseline_score, + problem_type_value, + dummy_binary_pipeline_class, + dummy_multiclass_pipeline_class, + dummy_regression_pipeline_class, + dummy_time_series_regression_pipeline_class, + X_y_binary, +): if not objective.is_defined_for_problem_type(problem_type_value): pytest.skip("Skipping because objective is not defined for problem type") # Ok to only use binary labels since score and fit methods are mocked X, y = X_y_binary - pipeline_class = {ProblemTypes.BINARY: dummy_binary_pipeline_class, - ProblemTypes.MULTICLASS: dummy_multiclass_pipeline_class, - ProblemTypes.REGRESSION: dummy_regression_pipeline_class, - ProblemTypes.TIME_SERIES_REGRESSION: dummy_time_series_regression_pipeline_class}[problem_type_value] - baseline_pipeline_class = {ProblemTypes.BINARY: "evalml.pipelines.BinaryClassificationPipeline", - ProblemTypes.MULTICLASS: "evalml.pipelines.MulticlassClassificationPipeline", - ProblemTypes.REGRESSION: "evalml.pipelines.RegressionPipeline", - ProblemTypes.TIME_SERIES_REGRESSION: "evalml.pipelines.TimeSeriesRegressionPipeline" - }[problem_type_value] + 
pipeline_class = { + ProblemTypes.BINARY: dummy_binary_pipeline_class, + ProblemTypes.MULTICLASS: dummy_multiclass_pipeline_class, + ProblemTypes.REGRESSION: dummy_regression_pipeline_class, + ProblemTypes.TIME_SERIES_REGRESSION: dummy_time_series_regression_pipeline_class, + }[problem_type_value] + baseline_pipeline_class = { + ProblemTypes.BINARY: "evalml.pipelines.BinaryClassificationPipeline", + ProblemTypes.MULTICLASS: "evalml.pipelines.MulticlassClassificationPipeline", + ProblemTypes.REGRESSION: "evalml.pipelines.RegressionPipeline", + ProblemTypes.TIME_SERIES_REGRESSION: "evalml.pipelines.TimeSeriesRegressionPipeline", + }[problem_type_value] class DummyPipeline(pipeline_class): problem_type = problem_type_value @@ -1319,63 +1905,128 @@ class Pipeline2(DummyPipeline): Pipeline2.score = mock_score_2 if objective.name.lower() == "cost benefit matrix": - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type_value, max_iterations=3, - allowed_pipelines=[Pipeline1({}), Pipeline2({})], objective=objective(0, 0, 0, 0), - additional_objectives=[], optimize_thresholds=False, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=problem_type_value, + max_iterations=3, + allowed_pipelines=[Pipeline1({}), Pipeline2({})], + objective=objective(0, 0, 0, 0), + additional_objectives=[], + optimize_thresholds=False, + n_jobs=1, + ) elif problem_type_value == ProblemTypes.TIME_SERIES_REGRESSION: - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type_value, max_iterations=3, - allowed_pipelines=[Pipeline1({'pipeline': {'date_index': None, 'gap': 0, 'max_delay': 0}}), Pipeline2({'pipeline': {'date_index': None, 'gap': 0, 'max_delay': 0}})], objective=objective, - additional_objectives=[], problem_configuration={'date_index': None, 'gap': 0, 'max_delay': 0}, train_best_pipeline=False, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=problem_type_value, + max_iterations=3, + allowed_pipelines=[ + Pipeline1({"pipeline": {"date_index": None, "gap": 0, "max_delay": 0}}), + Pipeline2({"pipeline": {"date_index": None, "gap": 0, "max_delay": 0}}), + ], + objective=objective, + additional_objectives=[], + problem_configuration={"date_index": None, "gap": 0, "max_delay": 0}, + train_best_pipeline=False, + n_jobs=1, + ) else: - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type_value, max_iterations=3, - allowed_pipelines=[Pipeline1({}), Pipeline2({})], objective=objective, - additional_objectives=[], optimize_thresholds=False, n_jobs=1) - - with patch(baseline_pipeline_class + ".score", return_value={objective.name: baseline_score}): + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=problem_type_value, + max_iterations=3, + allowed_pipelines=[Pipeline1({}), Pipeline2({})], + objective=objective, + additional_objectives=[], + optimize_thresholds=False, + n_jobs=1, + ) + + with patch( + baseline_pipeline_class + ".score", + return_value={objective.name: baseline_score}, + ): if np.isnan(pipeline_scores).all(): - with pytest.raises(AutoMLSearchException, match="All pipelines in the current AutoML batch produced a score of np.nan on the primary objective"): + with pytest.raises( + AutoMLSearchException, + match="All pipelines in the current AutoML batch produced a score of np.nan on the primary objective", + ): automl.search() else: automl.search() - scores = dict(zip(automl.rankings.pipeline_name, automl.rankings.percent_better_than_baseline)) - baseline_name = next(name for name in 
automl.rankings.pipeline_name if name not in {"Pipeline1", "Pipeline2"}) - answers = {"Pipeline1": round(objective.calculate_percent_difference(pipeline_scores[0], baseline_score), 2), - "Pipeline2": round(objective.calculate_percent_difference(pipeline_scores[1], baseline_score), 2), - baseline_name: round(objective.calculate_percent_difference(baseline_score, baseline_score), 2)} + scores = dict( + zip( + automl.rankings.pipeline_name, + automl.rankings.percent_better_than_baseline, + ) + ) + baseline_name = next( + name + for name in automl.rankings.pipeline_name + if name not in {"Pipeline1", "Pipeline2"} + ) + answers = { + "Pipeline1": round( + objective.calculate_percent_difference( + pipeline_scores[0], baseline_score + ), + 2, + ), + "Pipeline2": round( + objective.calculate_percent_difference( + pipeline_scores[1], baseline_score + ), + 2, + ), + baseline_name: round( + objective.calculate_percent_difference(baseline_score, baseline_score), + 2, + ), + } for name in answers: np.testing.assert_almost_equal(scores[name], answers[name], decimal=3) @pytest.mark.parametrize("custom_additional_objective", [True, False]) -@pytest.mark.parametrize("problem_type", ["binary", "multiclass", "regression", "time series regression"]) +@pytest.mark.parametrize( + "problem_type", ["binary", "multiclass", "regression", "time series regression"] +) @patch("evalml.pipelines.BinaryClassificationPipeline.fit") @patch("evalml.pipelines.MulticlassClassificationPipeline.fit") @patch("evalml.pipelines.RegressionPipeline.fit") @patch("evalml.pipelines.TimeSeriesRegressionPipeline.fit") -def test_percent_better_than_baseline_computed_for_all_objectives(mock_time_series_baseline_regression_fit, - mock_regression_fit, - mock_multiclass_fit, - mock_binary_fit, - problem_type, - custom_additional_objective, - dummy_binary_pipeline_class, - dummy_multiclass_pipeline_class, - dummy_regression_pipeline_class, - dummy_time_series_regression_pipeline_class, - X_y_binary): +def test_percent_better_than_baseline_computed_for_all_objectives( + mock_time_series_baseline_regression_fit, + mock_regression_fit, + mock_multiclass_fit, + mock_binary_fit, + problem_type, + custom_additional_objective, + dummy_binary_pipeline_class, + dummy_multiclass_pipeline_class, + dummy_regression_pipeline_class, + dummy_time_series_regression_pipeline_class, + X_y_binary, +): X, y = X_y_binary problem_type_enum = handle_problem_types(problem_type) - pipeline_class = {"binary": dummy_binary_pipeline_class, - "multiclass": dummy_multiclass_pipeline_class, - "regression": dummy_regression_pipeline_class, - "time series regression": dummy_time_series_regression_pipeline_class}[problem_type] - baseline_pipeline_class = {ProblemTypes.BINARY: "evalml.pipelines.BinaryClassificationPipeline", - ProblemTypes.MULTICLASS: "evalml.pipelines.MulticlassClassificationPipeline", - ProblemTypes.REGRESSION: "evalml.pipelines.RegressionPipeline", - ProblemTypes.TIME_SERIES_REGRESSION: "evalml.pipelines.TimeSeriesRegressionPipeline" - }[problem_type_enum] + pipeline_class = { + "binary": dummy_binary_pipeline_class, + "multiclass": dummy_multiclass_pipeline_class, + "regression": dummy_regression_pipeline_class, + "time series regression": dummy_time_series_regression_pipeline_class, + }[problem_type] + baseline_pipeline_class = { + ProblemTypes.BINARY: "evalml.pipelines.BinaryClassificationPipeline", + ProblemTypes.MULTICLASS: "evalml.pipelines.MulticlassClassificationPipeline", + ProblemTypes.REGRESSION: "evalml.pipelines.RegressionPipeline", + 
ProblemTypes.TIME_SERIES_REGRESSION: "evalml.pipelines.TimeSeriesRegressionPipeline", + }[problem_type_enum] class DummyPipeline(pipeline_class): name = "Dummy 1" @@ -1398,51 +2049,76 @@ def fit(self, *args, **kwargs): if CustomClassificationObjective.is_defined_for_problem_type(problem_type_enum): additional_objectives = [CustomClassificationObjective()] else: - additional_objectives = [CustomRegressionObjective(), "Root Mean Squared Error"] + additional_objectives = [ + CustomRegressionObjective(), + "Root Mean Squared Error", + ] core_objectives = get_core_objectives(problem_type) if additional_objectives: - core_objectives = [get_default_primary_search_objective(problem_type_enum)] + additional_objectives + core_objectives = [ + get_default_primary_search_objective(problem_type_enum) + ] + additional_objectives mock_scores = {get_objective(obj).name: i for i, obj in enumerate(core_objectives)} - mock_baseline_scores = {get_objective(obj).name: i + 1 for i, obj in enumerate(core_objectives)} + mock_baseline_scores = { + get_objective(obj).name: i + 1 for i, obj in enumerate(core_objectives) + } answer = {} baseline_percent_difference = {} for obj in core_objectives: obj_class = get_objective(obj) - answer[obj_class.name] = obj_class.calculate_percent_difference(mock_scores[obj_class.name], - mock_baseline_scores[obj_class.name]) + answer[obj_class.name] = obj_class.calculate_percent_difference( + mock_scores[obj_class.name], mock_baseline_scores[obj_class.name] + ) baseline_percent_difference[obj_class.name] = 0 mock_score_1 = MagicMock(return_value=mock_scores) DummyPipeline.score = mock_score_1 parameters = {} if problem_type_enum == ProblemTypes.TIME_SERIES_REGRESSION: - parameters = {"pipeline": {'date_index': None, "gap": 6, "max_delay": 3}} + parameters = {"pipeline": {"date_index": None, "gap": 6, "max_delay": 3}} # specifying problem_configuration for all problem types for conciseness - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type, max_iterations=2, - allowed_pipelines=[DummyPipeline(parameters)], - objective="auto", problem_configuration={'date_index': None, 'gap': 1, 'max_delay': 1}, - additional_objectives=additional_objectives) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=problem_type, + max_iterations=2, + allowed_pipelines=[DummyPipeline(parameters)], + objective="auto", + problem_configuration={"date_index": None, "gap": 1, "max_delay": 1}, + additional_objectives=additional_objectives, + ) with patch(baseline_pipeline_class + ".score", return_value=mock_baseline_scores): automl.search() - assert len(automl.results['pipeline_results']) == 2, "This tests assumes only one non-baseline pipeline was run!" - pipeline_results = automl.results['pipeline_results'][1] - baseline_results = automl.results['pipeline_results'][0] + assert ( + len(automl.results["pipeline_results"]) == 2 + ), "This tests assumes only one non-baseline pipeline was run!" 
+ pipeline_results = automl.results["pipeline_results"][1] + baseline_results = automl.results["pipeline_results"][0] assert pipeline_results["percent_better_than_baseline_all_objectives"] == answer - assert pipeline_results['percent_better_than_baseline'] == pipeline_results["percent_better_than_baseline_all_objectives"][automl.objective.name] + assert ( + pipeline_results["percent_better_than_baseline"] + == pipeline_results["percent_better_than_baseline_all_objectives"][ + automl.objective.name + ] + ) # Check that baseline is 0% better than baseline - assert baseline_results["percent_better_than_baseline_all_objectives"] == baseline_percent_difference + assert ( + baseline_results["percent_better_than_baseline_all_objectives"] + == baseline_percent_difference + ) @pytest.mark.parametrize("fold_scores", [[2, 4, 6], [np.nan, 4, 6]]) -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={'Log Loss Binary': 1, 'F1': 1}) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_percent_better_than_baseline_scores_different_folds(mock_fit, - mock_score, - fold_scores, - dummy_binary_pipeline_class, - X_y_binary): +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 1, "F1": 1}, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_percent_better_than_baseline_scores_different_folds( + mock_fit, mock_score, fold_scores, dummy_binary_pipeline_class, X_y_binary +): # Test that percent-better-than-baseline is correctly computed when scores differ across folds X, y = X_y_binary @@ -1459,7 +2135,9 @@ def new(self, parameters, random_seed=0): def clone(self): return self.__class__(self.parameters, random_seed=self.random_seed) - mock_score = MagicMock(side_effect=[{"Log Loss Binary": 1, "F1": val} for val in fold_scores]) + mock_score = MagicMock( + side_effect=[{"Log Loss Binary": 1, "F1": val} for val in fold_scores] + ) DummyPipeline.score = mock_score f1 = get_objective("f1")() @@ -1468,63 +2146,125 @@ def clone(self): else: answer = f1.calculate_percent_difference(4, 1) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=2, - allowed_pipelines=[DummyPipeline({})], objective="log loss binary", additional_objectives=["f1"]) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=2, + allowed_pipelines=[DummyPipeline({})], + objective="log loss binary", + additional_objectives=["f1"], + ) automl.search() - assert len(automl.results['pipeline_results']) == 2, "This tests assumes only one non-baseline pipeline was run!" - pipeline_results = automl.results['pipeline_results'][1] - np.testing.assert_equal(pipeline_results["percent_better_than_baseline_all_objectives"]['F1'], answer) + assert ( + len(automl.results["pipeline_results"]) == 2 + ), "This tests assumes only one non-baseline pipeline was run!" 
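#
# (Illustrative worked example, not lines from the patch above.) For fold_scores == [2, 4, 6] the
# mocked per-fold F1 values average to (2 + 4 + 6) / 3 == 4, while the baseline pipeline's mocked F1
# is 1 (from the patched score return value), so the expected value computed earlier is
# f1.calculate_percent_difference(4, 1); the assertion below checks that this matches the "F1" entry
# of percent_better_than_baseline_all_objectives for the one non-baseline pipeline.
#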
+ pipeline_results = automl.results["pipeline_results"][1] + np.testing.assert_equal( + pipeline_results["percent_better_than_baseline_all_objectives"]["F1"], answer + ) def _get_first_stacked_classifier_no(model_families=None): """Gets the number of iterations necessary before the stacked ensemble will be used.""" - num_classifiers = len(get_estimators(ProblemTypes.BINARY, model_families=model_families)) + num_classifiers = len( + get_estimators(ProblemTypes.BINARY, model_families=model_families) + ) # Baseline + first batch + each pipeline iteration (5 is current default pipelines_per_batch) + 1 return 1 + num_classifiers + num_classifiers * 5 + 1 -@pytest.mark.parametrize("max_iterations", [None, 1, 8, 10, _get_first_stacked_classifier_no(), _get_first_stacked_classifier_no() + 2]) +@pytest.mark.parametrize( + "max_iterations", + [ + None, + 1, + 8, + 10, + _get_first_stacked_classifier_no(), + _get_first_stacked_classifier_no() + 2, + ], +) @pytest.mark.parametrize("use_ensembling", [True, False]) -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.8}) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_max_iteration_works_with_stacked_ensemble(mock_pipeline_fit, mock_score, max_iterations, use_ensembling, X_y_binary, caplog): +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.8}, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_max_iteration_works_with_stacked_ensemble( + mock_pipeline_fit, mock_score, max_iterations, use_ensembling, X_y_binary, caplog +): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=max_iterations, objective="Log Loss Binary", ensembling=use_ensembling) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=max_iterations, + objective="Log Loss Binary", + ensembling=use_ensembling, + ) automl.search() # every nth batch a stacked ensemble will be trained if max_iterations is None: max_iterations = 5 # Default value for max_iterations - pipeline_names = automl.rankings['pipeline_name'] + pipeline_names = automl.rankings["pipeline_name"] if max_iterations < _get_first_stacked_classifier_no(): - assert not pipeline_names.str.contains('Ensemble').any() + assert not pipeline_names.str.contains("Ensemble").any() elif use_ensembling: - assert pipeline_names.str.contains('Ensemble').any() - assert f"Ensembling will run at the {_get_first_stacked_classifier_no()} iteration" in caplog.text + assert pipeline_names.str.contains("Ensemble").any() + assert ( + f"Ensembling will run at the {_get_first_stacked_classifier_no()} iteration" + in caplog.text + ) else: - assert not pipeline_names.str.contains('Ensemble').any() + assert not pipeline_names.str.contains("Ensemble").any() @pytest.mark.parametrize("max_batches", [None, 1, 5, 8, 9, 10, 12, 20]) @pytest.mark.parametrize("use_ensembling", [True, False]) @pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.REGRESSION]) -@patch('evalml.pipelines.RegressionPipeline.score', return_value={"R2": 0.8}) -@patch('evalml.pipelines.RegressionPipeline.fit') -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.8}) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_max_batches_works(mock_pipeline_fit, mock_score, mock_regression_fit, mock_regression_score, - max_batches, use_ensembling, problem_type, X_y_binary, 
X_y_regression): +@patch("evalml.pipelines.RegressionPipeline.score", return_value={"R2": 0.8}) +@patch("evalml.pipelines.RegressionPipeline.fit") +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.8}, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_max_batches_works( + mock_pipeline_fit, + mock_score, + mock_regression_fit, + mock_regression_score, + max_batches, + use_ensembling, + problem_type, + X_y_binary, + X_y_regression, +): if problem_type == ProblemTypes.BINARY: X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=None, - max_batches=max_batches, ensembling=use_ensembling) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=None, + max_batches=max_batches, + ensembling=use_ensembling, + ) elif problem_type == ProblemTypes.REGRESSION: X, y = X_y_regression - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="regression", max_iterations=None, - max_batches=max_batches, ensembling=use_ensembling) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + max_iterations=None, + max_batches=max_batches, + ensembling=use_ensembling, + ) automl.search() # every nth batch a stacked ensemble will be trained @@ -1540,43 +2280,86 @@ def test_max_batches_works(mock_pipeline_fit, mock_score, mock_regression_fit, m num_ensemble_batches = 0 else: # automl algorithm does not know about the additional stacked ensemble pipelines - num_ensemble_batches = (max_batches - 1) // ensemble_nth_batch if use_ensembling else 0 + num_ensemble_batches = ( + (max_batches - 1) // ensemble_nth_batch if use_ensembling else 0 + ) # So that the test does not break when new estimator classes are added - n_results = 1 + len(automl.allowed_pipelines) + (5 * (max_batches - 1 - num_ensemble_batches)) + num_ensemble_batches + n_results = ( + 1 + + len(automl.allowed_pipelines) + + (5 * (max_batches - 1 - num_ensemble_batches)) + + num_ensemble_batches + ) n_automl_pipelines = n_results assert automl._automl_algorithm.batch_number == max_batches assert automl._automl_algorithm.pipeline_number + 1 == n_automl_pipelines assert len(automl.results["pipeline_results"]) == n_results if num_ensemble_batches == 0: - assert automl.rankings.shape[0] == min(1 + len(automl.allowed_pipelines), n_results) # add one for baseline + assert automl.rankings.shape[0] == min( + 1 + len(automl.allowed_pipelines), n_results + ) # add one for baseline else: - assert automl.rankings.shape[0] == min(2 + len(automl.allowed_pipelines), n_results) # add two for baseline and stacked ensemble + assert automl.rankings.shape[0] == min( + 2 + len(automl.allowed_pipelines), n_results + ) # add two for baseline and stacked ensemble assert automl.full_rankings.shape[0] == n_results def test_early_stopping_negative(X_y_binary): X, y = X_y_binary - with pytest.raises(ValueError, match='patience value must be a positive integer.'): - AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='AUC', max_iterations=5, allowed_model_families=['linear_model'], patience=-1, random_seed=0) - with pytest.raises(ValueError, match='tolerance value must be'): - AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='AUC', max_iterations=5, allowed_model_families=['linear_model'], patience=1, tolerance=1.5, random_seed=0) + with pytest.raises(ValueError, match="patience value must be a positive integer."): + AutoMLSearch( + X_train=X, + y_train=y, + 
problem_type="binary", + objective="AUC", + max_iterations=5, + allowed_model_families=["linear_model"], + patience=-1, + random_seed=0, + ) + with pytest.raises(ValueError, match="tolerance value must be"): + AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="AUC", + max_iterations=5, + allowed_model_families=["linear_model"], + patience=1, + tolerance=1.5, + random_seed=0, + ) def test_early_stopping(caplog, logistic_regression_binary_pipeline_class, X_y_binary): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='AUC', max_iterations=5, - allowed_model_families=['linear_model'], patience=2, tolerance=0.05, - random_seed=0, n_jobs=1) - mock_results = { - 'search_order': [0, 1, 2, 3], - 'pipeline_results': {} - } - - scores = [0.84, 0.95, 0.84, 0.96] # 0.96 is only 1% greater so it doesn't trigger patience due to tolerance - for id in mock_results['search_order']: - mock_results['pipeline_results'][id] = {} - mock_results['pipeline_results'][id]["mean_cv_score"] = scores[id] - mock_results['pipeline_results'][id]['pipeline_class'] = logistic_regression_binary_pipeline_class + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="AUC", + max_iterations=5, + allowed_model_families=["linear_model"], + patience=2, + tolerance=0.05, + random_seed=0, + n_jobs=1, + ) + mock_results = {"search_order": [0, 1, 2, 3], "pipeline_results": {}} + + scores = [ + 0.84, + 0.95, + 0.84, + 0.96, + ] # 0.96 is only 1% greater so it doesn't trigger patience due to tolerance + for id in mock_results["search_order"]: + mock_results["pipeline_results"][id] = {} + mock_results["pipeline_results"][id]["mean_cv_score"] = scores[id] + mock_results["pipeline_results"][id][ + "pipeline_class" + ] = logistic_regression_binary_pipeline_class automl._results = mock_results assert not automl._should_continue() @@ -1584,86 +2367,184 @@ def test_early_stopping(caplog, logistic_regression_binary_pipeline_class, X_y_b assert "2 iterations without improvement. Stopping search early." in out -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.8}) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_automl_one_allowed_pipeline_ensembling_disabled(mock_pipeline_fit, mock_score, X_y_binary, logistic_regression_binary_pipeline_class, caplog): +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.8}, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_automl_one_allowed_pipeline_ensembling_disabled( + mock_pipeline_fit, + mock_score, + X_y_binary, + logistic_regression_binary_pipeline_class, + caplog, +): max_iterations = _get_first_stacked_classifier_no([ModelFamily.RANDOM_FOREST]) + 1 # Checks that when len(allowed_pipeline) == 1, ensembling is not run, even if set to True X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=max_iterations, allowed_model_families=[ModelFamily.RANDOM_FOREST], ensembling=True) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=max_iterations, + allowed_model_families=[ModelFamily.RANDOM_FOREST], + ensembling=True, + ) automl.search() - assert "Ensembling is set to True, but the number of unique pipelines is one, so ensembling will not run." in caplog.text + assert ( + "Ensembling is set to True, but the number of unique pipelines is one, so ensembling will not run." 
+ in caplog.text + ) - pipeline_names = automl.rankings['pipeline_name'] - assert not pipeline_names.str.contains('Ensemble').any() + pipeline_names = automl.rankings["pipeline_name"] + assert not pipeline_names.str.contains("Ensemble").any() caplog.clear() max_iterations = _get_first_stacked_classifier_no([ModelFamily.LINEAR_MODEL]) + 1 - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=max_iterations, allowed_pipelines=[logistic_regression_binary_pipeline_class({})], ensembling=True) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=max_iterations, + allowed_pipelines=[logistic_regression_binary_pipeline_class({})], + ensembling=True, + ) automl.search() - pipeline_names = automl.rankings['pipeline_name'] - assert not pipeline_names.str.contains('Ensemble').any() - assert "Ensembling is set to True, but the number of unique pipelines is one, so ensembling will not run." in caplog.text + pipeline_names = automl.rankings["pipeline_name"] + assert not pipeline_names.str.contains("Ensemble").any() + assert ( + "Ensembling is set to True, but the number of unique pipelines is one, so ensembling will not run." + in caplog.text + ) # Check that ensembling runs when len(allowed_model_families) == 1 but len(allowed_pipelines) > 1 caplog.clear() - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=max_iterations, allowed_model_families=[ModelFamily.LINEAR_MODEL], ensembling=True) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=max_iterations, + allowed_model_families=[ModelFamily.LINEAR_MODEL], + ensembling=True, + ) automl.search() - pipeline_names = automl.rankings['pipeline_name'] - assert pipeline_names.str.contains('Ensemble').any() - assert "Ensembling is set to True, but the number of unique pipelines is one, so ensembling will not run." not in caplog.text + pipeline_names = automl.rankings["pipeline_name"] + assert pipeline_names.str.contains("Ensemble").any() + assert ( + "Ensembling is set to True, but the number of unique pipelines is one, so ensembling will not run." + not in caplog.text + ) -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.8}) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_automl_max_iterations_less_than_ensembling_disabled(mock_pipeline_fit, mock_score, X_y_binary, caplog): +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.8}, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_automl_max_iterations_less_than_ensembling_disabled( + mock_pipeline_fit, mock_score, X_y_binary, caplog +): max_iterations = _get_first_stacked_classifier_no([ModelFamily.LINEAR_MODEL]) X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=max_iterations - 1, allowed_model_families=[ModelFamily.LINEAR_MODEL], ensembling=True) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=max_iterations - 1, + allowed_model_families=[ModelFamily.LINEAR_MODEL], + ensembling=True, + ) automl.search() - assert f"Ensembling is set to True, but max_iterations is too small, so ensembling will not run. Set max_iterations >= {max_iterations} to run ensembling." in caplog.text + assert ( + f"Ensembling is set to True, but max_iterations is too small, so ensembling will not run. 
Set max_iterations >= {max_iterations} to run ensembling." + in caplog.text + ) - pipeline_names = automl.rankings['pipeline_name'] - assert not pipeline_names.str.contains('Ensemble').any() + pipeline_names = automl.rankings["pipeline_name"] + assert not pipeline_names.str.contains("Ensemble").any() -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.8}) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_automl_max_batches_less_than_ensembling_disabled(mock_pipeline_fit, mock_score, X_y_binary, caplog): +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.8}, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_automl_max_batches_less_than_ensembling_disabled( + mock_pipeline_fit, mock_score, X_y_binary, caplog +): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_batches=2, allowed_model_families=[ModelFamily.LINEAR_MODEL], ensembling=True) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_batches=2, + allowed_model_families=[ModelFamily.LINEAR_MODEL], + ensembling=True, + ) automl.search() - first_ensemble_batch = 1 + len(automl.allowed_pipelines) + 1 # First batch + each pipeline batch - assert f"Ensembling is set to True, but max_batches is too small, so ensembling will not run. Set max_batches >= {first_ensemble_batch} to run ensembling." in caplog.text + first_ensemble_batch = ( + 1 + len(automl.allowed_pipelines) + 1 + ) # First batch + each pipeline batch + assert ( + f"Ensembling is set to True, but max_batches is too small, so ensembling will not run. Set max_batches >= {first_ensemble_batch} to run ensembling." + in caplog.text + ) - pipeline_names = automl.rankings['pipeline_name'] - assert not pipeline_names.str.contains('Ensemble').any() + pipeline_names = automl.rankings["pipeline_name"] + assert not pipeline_names.str.contains("Ensemble").any() @pytest.mark.parametrize("max_batches", [1, 2, 5, 10]) -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.8}) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_max_batches_output(mock_pipeline_fit, mock_score, max_batches, X_y_binary, caplog): +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.8}, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_max_batches_output( + mock_pipeline_fit, mock_score, max_batches, X_y_binary, caplog +): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=None, max_batches=max_batches) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=None, + max_batches=max_batches, + ) automl.search() output = caplog.text assert output.count("Batch Number") == max_batches -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.8}) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_max_batches_plays_nice_with_other_stopping_criteria(mock_fit, mock_score, X_y_binary): +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.8}, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_max_batches_plays_nice_with_other_stopping_criteria( + mock_fit, mock_score, X_y_binary +): X, y = X_y_binary # Use the old default when all are None - automl = 
AutoMLSearch(X_train=X, y_train=y, problem_type="binary", objective="Log Loss Binary") + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", objective="Log Loss Binary" + ) automl.search() - assert len(automl.results["pipeline_results"]) == len(get_estimators(problem_type='binary')) + 1 + assert ( + len(automl.results["pipeline_results"]) + == len(get_estimators(problem_type="binary")) + 1 + ) # Use max_iterations when both max_iterations and max_batches are set - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", objective="Log Loss Binary", max_batches=10, - max_iterations=6) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="Log Loss Binary", + max_batches=10, + max_iterations=6, + ) automl.search() assert len(automl.results["pipeline_results"]) == 6 @@ -1676,26 +2557,43 @@ def test_max_batches_plays_nice_with_other_stopping_criteria(mock_fit, mock_scor @pytest.mark.parametrize("max_batches", [-1, -10, -np.inf]) def test_max_batches_must_be_non_negative(max_batches, X_y_binary): X, y = X_y_binary - with pytest.raises(ValueError, match=f"Parameter max_batches must be None or non-negative. Received {max_batches}."): - AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_batches=max_batches) + with pytest.raises( + ValueError, + match=f"Parameter max_batches must be None or non-negative. Received {max_batches}.", + ): + AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", max_batches=max_batches + ) def test_stopping_criterion_bad(X_y_binary): X, y = X_y_binary - with pytest.raises(TypeError, match=r"Parameter max_time must be a float, int, string or None. Received with value \('test',\)."): - AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_time=('test',)) - with pytest.raises(ValueError, match=f"Parameter max_batches must be None or non-negative. Received -1."): + with pytest.raises( + TypeError, + match=r"Parameter max_time must be a float, int, string or None. Received with value \('test',\).", + ): + AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_time=("test",)) + with pytest.raises( + ValueError, + match=f"Parameter max_batches must be None or non-negative. Received -1.", + ): AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_batches=-1) - with pytest.raises(ValueError, match=f"Parameter max_time must be None or non-negative. Received -1."): + with pytest.raises( + ValueError, + match=f"Parameter max_time must be None or non-negative. Received -1.", + ): AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_time=-1) - with pytest.raises(ValueError, match=f"Parameter max_iterations must be None or non-negative. Received -1."): + with pytest.raises( + ValueError, + match=f"Parameter max_iterations must be None or non-negative. 
Received -1.", + ): AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=-1) -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_data_splitter_binary(mock_fit, mock_score, X_y_binary): - mock_score.return_value = {'Log Loss Binary': 1.0} + mock_score.return_value = {"Log Loss Binary": 1.0} X, y = X_y_binary y[:] = 0 y[0] = 1 @@ -1715,15 +2613,15 @@ def test_data_splitter_binary(mock_fit, mock_score, X_y_binary): automl.search() -@patch('evalml.pipelines.MulticlassClassificationPipeline.score') -@patch('evalml.pipelines.MulticlassClassificationPipeline.fit') +@patch("evalml.pipelines.MulticlassClassificationPipeline.score") +@patch("evalml.pipelines.MulticlassClassificationPipeline.fit") def test_data_splitter_multi(mock_fit, mock_score, X_y_multi): - mock_score.return_value = {'Log Loss Multiclass': 1.0} + mock_score.return_value = {"Log Loss Multiclass": 1.0} X, y = X_y_multi y[:] = 1 y[0] = 0 - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', n_jobs=1) + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="multiclass", n_jobs=1) with pytest.raises(Exception, match="Missing target values"): with pytest.warns(UserWarning): automl.search() @@ -1732,42 +2630,50 @@ def test_data_splitter_multi(mock_fit, mock_score, X_y_multi): # match based on regex, since data split doesn't have a random seed for reproducibility # regex matches the set {} and expects either 2 sets (missing in both train and test) # or 1 set of multiple elements (both missing in train or both in test) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', n_jobs=1) + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="multiclass", n_jobs=1) with pytest.raises(Exception, match=r"(\{\d?\}.+\{\d?\})|(\{.+\,.+\})"): with pytest.warns(UserWarning): automl.search() y[1] = 0 y[2:4] = 2 - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', n_jobs=1) + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="multiclass", n_jobs=1) with pytest.raises(Exception, match="Missing target values"): with pytest.warns(UserWarning): automl.search() y[4] = 2 - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', n_jobs=1) + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="multiclass", n_jobs=1) with pytest.raises(Exception, match="Missing target values"): with pytest.warns(UserWarning): automl.search() y[5] = 0 - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', n_jobs=1) + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="multiclass", n_jobs=1) automl.search() -@patch('evalml.tuners.skopt_tuner.SKOptTuner.add') -def test_iterative_algorithm_pipeline_hyperparameters_make_pipeline_other_errors(mock_add, X_y_multi): +@patch("evalml.tuners.skopt_tuner.SKOptTuner.add") +def test_iterative_algorithm_pipeline_hyperparameters_make_pipeline_other_errors( + mock_add, X_y_multi +): X, y = X_y_multi custom_hyperparameters = { - "Imputer": { - "numeric_impute_strategy": Categorical(["most_frequent", "mean"]) - } + "Imputer": {"numeric_impute_strategy": Categorical(["most_frequent", "mean"])} } - estimators = get_estimators('multiclass', [ModelFamily.EXTRA_TREES]) - - pipelines = [make_pipeline(X, y, estimator, 'multiclass', None) for estimator in estimators] - automl = AutoMLSearch(X_train=X, y_train=y, 
problem_type='multiclass', allowed_pipelines=pipelines, - custom_hyperparameters=custom_hyperparameters, n_jobs=1) + estimators = get_estimators("multiclass", [ModelFamily.EXTRA_TREES]) + + pipelines = [ + make_pipeline(X, y, estimator, "multiclass", None) for estimator in estimators + ] + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="multiclass", + allowed_pipelines=pipelines, + custom_hyperparameters=custom_hyperparameters, + n_jobs=1, + ) mock_add.side_effect = ValueError("Alternate error that can be thrown") with pytest.raises(ValueError) as error: @@ -1776,16 +2682,27 @@ def test_iterative_algorithm_pipeline_hyperparameters_make_pipeline_other_errors assert "Default parameters for components" not in str(error.value) -@pytest.mark.parametrize("pipelines,pipeline_parameters", [(True, False), (True, True), (False, False)]) +@pytest.mark.parametrize( + "pipelines,pipeline_parameters", [(True, False), (True, True), (False, False)] +) @pytest.mark.parametrize("automl_parameters", [True, False]) @pytest.mark.parametrize("custom_hyperparameters", [True, False]) -@patch('evalml.pipelines.MulticlassClassificationPipeline.score', return_value={"Log Loss Multiclass": 0.6}) -@patch('evalml.pipelines.MulticlassClassificationPipeline.fit') -def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline(mock_fit, mock_score, custom_hyperparameters, - automl_parameters, pipelines, pipeline_parameters, - X_y_multi): +@patch( + "evalml.pipelines.MulticlassClassificationPipeline.score", + return_value={"Log Loss Multiclass": 0.6}, +) +@patch("evalml.pipelines.MulticlassClassificationPipeline.fit") +def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline( + mock_fit, + mock_score, + custom_hyperparameters, + automl_parameters, + pipelines, + pipeline_parameters, + X_y_multi, +): X, y = X_y_multi - X = pd.DataFrame(X, columns=[f'Column_{i}' for i in range(20)]) + X = pd.DataFrame(X, columns=[f"Column_{i}" for i in range(20)]) pipeline_parameters_ = None pipeline_ = None @@ -1794,62 +2711,113 @@ def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline(mock_ if pipeline_parameters: pipeline_parameters_ = { - "Drop Columns Transformer": {'columns': ['Column_0', 'Column_1', 'Column_2']}, - "Imputer": {'numeric_impute_strategy': 'most_frequent'}, - "Random Forest Classifier": {'n_estimators': 200, - "max_depth": 11} + "Drop Columns Transformer": { + "columns": ["Column_0", "Column_1", "Column_2"] + }, + "Imputer": {"numeric_impute_strategy": "most_frequent"}, + "Random Forest Classifier": {"n_estimators": 200, "max_depth": 11}, } if pipelines: - component_graph_ = ['Drop Columns Transformer', 'Imputer', 'Random Forest Classifier'] - pipeline_ = [MulticlassClassificationPipeline(component_graph=component_graph_, parameters=pipeline_parameters_)] + component_graph_ = [ + "Drop Columns Transformer", + "Imputer", + "Random Forest Classifier", + ] + pipeline_ = [ + MulticlassClassificationPipeline( + component_graph=component_graph_, parameters=pipeline_parameters_ + ) + ] if automl_parameters: automl_parameters_ = { - "Drop Columns Transformer": {'columns': ['Column_0', 'Column_1', 'Column_2']}, - "Random Forest Classifier": {'n_estimators': 201} + "Drop Columns Transformer": { + "columns": ["Column_0", "Column_1", "Column_2"] + }, + "Random Forest Classifier": {"n_estimators": 201}, } if custom_hyperparameters: custom_hyperparameters_ = { - "Imputer": { - "numeric_impute_strategy": Categorical(["mean"]) - }, + "Imputer": 
{"numeric_impute_strategy": Categorical(["mean"])}, "Random Forest Classifier": { "max_depth": Integer(4, 7), - 'n_estimators': Integer(190, 210) - } + "n_estimators": Integer(190, 210), + }, } - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=pipeline_, - pipeline_parameters=automl_parameters_, custom_hyperparameters=custom_hyperparameters_, max_batches=4) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="multiclass", + allowed_pipelines=pipeline_, + pipeline_parameters=automl_parameters_, + custom_hyperparameters=custom_hyperparameters_, + max_batches=4, + ) automl.search() for i, row in automl.full_rankings.iterrows(): - if "Random Forest Classifier" in row['pipeline_name']: + if "Random Forest Classifier" in row["pipeline_name"]: if pipelines and automl_parameters: - assert row["parameters"]["Drop Columns Transformer"]["columns"] == ['Column_0', 'Column_1', 'Column_2'] + assert row["parameters"]["Drop Columns Transformer"]["columns"] == [ + "Column_0", + "Column_1", + "Column_2", + ] elif pipeline_parameters: assert row["parameters"]["Drop Columns Transformer"]["columns"] is None if custom_hyperparameters_: - assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in custom_hyperparameters_['Imputer']['numeric_impute_strategy'] - assert 4 <= row["parameters"]["Random Forest Classifier"]["max_depth"] <= 7 + assert ( + row["parameters"]["Imputer"]["numeric_impute_strategy"] + in custom_hyperparameters_["Imputer"]["numeric_impute_strategy"] + ) + assert ( + 4 <= row["parameters"]["Random Forest Classifier"]["max_depth"] <= 7 + ) if automl_parameters and row["id"] == 1: - assert row["parameters"]["Random Forest Classifier"]["n_estimators"] == 201 + assert ( + row["parameters"]["Random Forest Classifier"]["n_estimators"] + == 201 + ) else: - assert 190 <= row["parameters"]["Random Forest Classifier"]["n_estimators"] <= 210 + assert ( + 190 + <= row["parameters"]["Random Forest Classifier"]["n_estimators"] + <= 210 + ) else: - assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in ["mean", "median", "most_frequent"] - assert 1 <= row["parameters"]["Random Forest Classifier"]["max_depth"] <= 10 + assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in [ + "mean", + "median", + "most_frequent", + ] + assert ( + 1 + <= row["parameters"]["Random Forest Classifier"]["max_depth"] + <= 10 + ) if automl_parameters and row["id"] == 1: - assert row["parameters"]["Random Forest Classifier"]["n_estimators"] == 201 + assert ( + row["parameters"]["Random Forest Classifier"]["n_estimators"] + == 201 + ) else: - assert 10 <= row["parameters"]["Random Forest Classifier"]["n_estimators"] <= 1000 + assert ( + 10 + <= row["parameters"]["Random Forest Classifier"]["n_estimators"] + <= 1000 + ) -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.6}) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_iterative_algorithm_passes_njobs_to_pipelines(mock_fit, mock_score, dummy_binary_pipeline_class, - X_y_binary): +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.6}, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_iterative_algorithm_passes_njobs_to_pipelines( + mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary +): X, y = X_y_binary class MockEstimatorWithNJobs(Estimator): @@ -1859,130 +2827,213 @@ class MockEstimatorWithNJobs(Estimator): hyperparameter_ranges = {} def 
__init__(self, n_jobs=-1, random_seed=0): - super().__init__(parameters={"n_jobs": n_jobs}, component_obj=None, random_seed=random_seed) - - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', n_jobs=3, max_batches=2, - allowed_pipelines=[BinaryClassificationPipeline([MockEstimatorWithNJobs], custom_name="Pipeline 1"), - BinaryClassificationPipeline([MockEstimatorWithNJobs], custom_name="Pipeline 2"), - dummy_binary_pipeline_class({})]) + super().__init__( + parameters={"n_jobs": n_jobs}, + component_obj=None, + random_seed=random_seed, + ) + + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + n_jobs=3, + max_batches=2, + allowed_pipelines=[ + BinaryClassificationPipeline( + [MockEstimatorWithNJobs], custom_name="Pipeline 1" + ), + BinaryClassificationPipeline( + [MockEstimatorWithNJobs], custom_name="Pipeline 2" + ), + dummy_binary_pipeline_class({}), + ], + ) automl.search() for parameters in automl.full_rankings.parameters: if "Mock Classifier with njobs" in parameters: assert parameters["Mock Classifier with njobs"]["n_jobs"] == 3 else: - assert all("n_jobs" not in component_params for component_params in parameters.values()) + assert all( + "n_jobs" not in component_params + for component_params in parameters.values() + ) -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_automl_ensembling_false(mock_fit, mock_score, X_y_binary): X, y = X_y_binary - mock_score.return_value = {'Log Loss Binary': 1.0} - - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_time='60 seconds', max_batches=20, ensembling=False) + mock_score.return_value = {"Log Loss Binary": 1.0} + + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_time="60 seconds", + max_batches=20, + ensembling=False, + ) automl.search() - assert not automl.rankings['pipeline_name'].str.contains('Ensemble').any() + assert not automl.rankings["pipeline_name"].str.contains("Ensemble").any() -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.8}) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.8}, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_search_with_text(mock_fit, mock_score): X = pd.DataFrame( - {'col_1': ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!', - 'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.', - 'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!', - 'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.', - 'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.', - 'I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!'], - 'col_2': ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', - 'I dreamed a dream in days gone by, when hope was high and life worth living', - 'Red, the blood of angry men - black, the dark of ages past', - 'do you hear the people sing? 
Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', - 'Red, the blood of angry men - black, the dark of ages past', - 'It was red and yellow and green and brown and scarlet and black and ochre and peach and ruby and olive and violet and fawn...'] - }) + { + "col_1": [ + "I'm singing in the rain! Just singing in the rain, what a glorious feeling, I'm happy again!", + "In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.", + "I'm gonna be the main event, like no king was before! I'm brushing up on looking down, I'm working on my ROAR!", + "In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.", + "In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.", + "I'm singing in the rain! Just singing in the rain, what a glorious feeling, I'm happy again!", + ], + "col_2": [ + "do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!", + "I dreamed a dream in days gone by, when hope was high and life worth living", + "Red, the blood of angry men - black, the dark of ages past", + "do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!", + "Red, the blood of angry men - black, the dark of ages past", + "It was red and yellow and green and brown and scarlet and black and ochre and peach and ruby and olive and violet and fawn...", + ], + } + ) y = [0, 1, 1, 0, 1, 0] - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary') + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary") automl.search() - assert automl.rankings['pipeline_name'][1:].str.contains('Text').all() - - -@pytest.mark.parametrize("problem_type,pipeline_name,ensemble_name", - [('binary', 'Stacked Ensemble Classification Pipeline', 'Stacked Ensemble Classifier'), - ('multiclass', 'Stacked Ensemble Classification Pipeline', 'Stacked Ensemble Classifier'), - ('regression', 'Stacked Ensemble Regression Pipeline', 'Stacked Ensemble Regressor')]) + assert automl.rankings["pipeline_name"][1:].str.contains("Text").all() + + +@pytest.mark.parametrize( + "problem_type,pipeline_name,ensemble_name", + [ + ( + "binary", + "Stacked Ensemble Classification Pipeline", + "Stacked Ensemble Classifier", + ), + ( + "multiclass", + "Stacked Ensemble Classification Pipeline", + "Stacked Ensemble Classifier", + ), + ( + "regression", + "Stacked Ensemble Regression Pipeline", + "Stacked Ensemble Regressor", + ), + ], +) @pytest.mark.parametrize("df_text", [True, False]) -@patch('evalml.automl.automl_algorithm.IterativeAlgorithm.__init__') -def test_search_with_text_and_ensembling(mock_iter, df_text, problem_type, pipeline_name, ensemble_name): +@patch("evalml.automl.automl_algorithm.IterativeAlgorithm.__init__") +def test_search_with_text_and_ensembling( + mock_iter, df_text, problem_type, pipeline_name, ensemble_name +): X_with_text = pd.DataFrame( - {'col_1': ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!', - 'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.', - 'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!', - 'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.', - 'In sleep he sang to me, in dreams he came... 
That voice which calls to me, and speaks my name.', - 'I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!', - 'do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', - 'I dreamed a dream in days gone by, when hope was high and life worth living', - 'Red, the blood of angry men - black, the dark of ages past', - 'do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', - 'Red, the blood of angry men - black, the dark of ages past', - 'It was red and yellow and green and brown and scarlet and black and ochre and peach and ruby and olive and violet and fawn...'] - }) - X_no_text = pd.DataFrame({'col_1': [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3]}) + { + "col_1": [ + "I'm singing in the rain! Just singing in the rain, what a glorious feeling, I'm happy again!", + "In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.", + "I'm gonna be the main event, like no king was before! I'm brushing up on looking down, I'm working on my ROAR!", + "In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.", + "In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.", + "I'm singing in the rain! Just singing in the rain, what a glorious feeling, I'm happy again!", + "do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!", + "I dreamed a dream in days gone by, when hope was high and life worth living", + "Red, the blood of angry men - black, the dark of ages past", + "do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!", + "Red, the blood of angry men - black, the dark of ages past", + "It was red and yellow and green and brown and scarlet and black and ochre and peach and ruby and olive and violet and fawn...", + ] + } + ) + X_no_text = pd.DataFrame({"col_1": [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3]}) if df_text: X = X_with_text else: X = X_no_text - if problem_type == 'binary': + if problem_type == "binary": y = [0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0] - elif problem_type == 'multiclass': + elif problem_type == "multiclass": y = [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2] else: y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] mock_iter.return_value = None - _ = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type, allowed_model_families=["random_forest", "decision_tree"], - max_batches=4, ensembling=True) + _ = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=problem_type, + allowed_model_families=["random_forest", "decision_tree"], + max_batches=4, + ensembling=True, + ) call_args = mock_iter.call_args_list[0][1] if df_text: - assert call_args['text_in_ensembling'] + assert call_args["text_in_ensembling"] else: - assert not call_args['text_in_ensembling'] + assert not call_args["text_in_ensembling"] -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.8}) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.8}, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_pipelines_per_batch(mock_fit, mock_score, X_y_binary): def total_pipelines(automl, num_batches, batch_size): total = 1 + len(automl.allowed_pipelines) - total += 
((num_batches - 1) * batch_size) + total += (num_batches - 1) * batch_size return total X, y = X_y_binary # Checking for default of _pipelines_per_batch - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_batches=2) + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_batches=2) automl.search() assert automl._pipelines_per_batch == 5 assert automl._automl_algorithm.pipelines_per_batch == 5 assert total_pipelines(automl, 2, 5) == len(automl.full_rankings) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_batches=1, _pipelines_per_batch=2) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_batches=1, + _pipelines_per_batch=2, + ) automl.search() assert automl._pipelines_per_batch == 2 assert automl._automl_algorithm.pipelines_per_batch == 2 assert total_pipelines(automl, 1, 2) == len(automl.full_rankings) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_batches=2, _pipelines_per_batch=10) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_batches=2, + _pipelines_per_batch=10, + ) automl.search() assert automl._pipelines_per_batch == 10 assert automl._automl_algorithm.pipelines_per_batch == 10 assert total_pipelines(automl, 2, 10) == len(automl.full_rankings) -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.8}) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_automl_respects_random_seed(mock_fit, mock_score, X_y_binary, dummy_classifier_estimator_class): +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.8}, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_automl_respects_random_seed( + mock_fit, mock_score, X_y_binary, dummy_classifier_estimator_class +): X, y = X_y_binary @@ -1995,40 +3046,66 @@ def __init__(self, parameters, random_seed=0): is_diff_random_seed = not (random_seed == 42) self.__class__.num_pipelines_init += 1 self.__class__.num_pipelines_different_seed += is_diff_random_seed - super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) + super().__init__( + self.component_graph, parameters=parameters, random_seed=random_seed + ) def new(self, parameters, random_seed=0): return self.__class__(parameters, random_seed=random_seed) def clone(self): return self.__class__(self.parameters, random_seed=self.random_seed) + pipelines = [DummyPipeline({})] DummyPipeline.num_pipelines_different_seed = 0 - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", allowed_pipelines=pipelines, - random_seed=42, max_iterations=10) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + allowed_pipelines=pipelines, + random_seed=42, + max_iterations=10, + ) automl.search() - assert DummyPipeline.num_pipelines_different_seed == 0 and DummyPipeline.num_pipelines_init + assert ( + DummyPipeline.num_pipelines_different_seed == 0 + and DummyPipeline.num_pipelines_init + ) -@pytest.mark.parametrize("callback", [log_error_callback, silent_error_callback, raise_error_callback]) -@pytest.mark.parametrize("error_type", ['fit', "mean_cv_score", 'fit-single']) -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.8}) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_automl_error_callback(mock_fit, mock_score, error_type, callback, X_y_binary, caplog): +@pytest.mark.parametrize( + 
"callback", [log_error_callback, silent_error_callback, raise_error_callback] +) +@pytest.mark.parametrize("error_type", ["fit", "mean_cv_score", "fit-single"]) +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.8}, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_automl_error_callback( + mock_fit, mock_score, error_type, callback, X_y_binary, caplog +): X, y = X_y_binary if error_type == "mean_cv_score": msg = "Score Error!" mock_score.side_effect = Exception(msg) - elif error_type == 'fit': + elif error_type == "fit": mock_score.return_value = {"Log Loss Binary": 0.8} - msg = 'all your model are belong to us' + msg = "all your model are belong to us" mock_fit.side_effect = Exception(msg) else: # throw exceptions for only one pipeline mock_score.return_value = {"Log Loss Binary": 0.8} - msg = 'all your model are belong to us' + msg = "all your model are belong to us" mock_fit.side_effect = [Exception(msg)] * 3 + [None] * 100 - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", error_callback=callback, train_best_pipeline=False, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + error_callback=callback, + train_best_pipeline=False, + n_jobs=1, + ) if callback in [log_error_callback, silent_error_callback]: exception = AutoMLSearchException match = "All pipelines in the current AutoML batch produced a score of np.nan on the primary objective" @@ -2036,7 +3113,10 @@ def test_automl_error_callback(mock_fit, mock_score, error_type, callback, X_y_b exception = Exception match = msg - if error_type == 'fit-single' and callback in [silent_error_callback, log_error_callback]: + if error_type == "fit-single" and callback in [ + silent_error_callback, + log_error_callback, + ]: automl.search() else: with pytest.raises(exception, match=match): @@ -2052,120 +3132,201 @@ def test_automl_error_callback(mock_fit, mock_score, error_type, callback, X_y_b assert msg in caplog.text -@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION]) -@patch('evalml.pipelines.RegressionPipeline.score') -@patch('evalml.pipelines.RegressionPipeline.fit') -@patch('evalml.pipelines.MulticlassClassificationPipeline.score') -@patch('evalml.pipelines.MulticlassClassificationPipeline.fit') -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_automl_woodwork_user_types_preserved(mock_binary_fit, mock_binary_score, - mock_multi_fit, mock_multi_score, - mock_regression_fit, mock_regression_score, problem_type, - X_y_binary, X_y_multi, X_y_regression): +@pytest.mark.parametrize( + "problem_type", + [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION], +) +@patch("evalml.pipelines.RegressionPipeline.score") +@patch("evalml.pipelines.RegressionPipeline.fit") +@patch("evalml.pipelines.MulticlassClassificationPipeline.score") +@patch("evalml.pipelines.MulticlassClassificationPipeline.fit") +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_automl_woodwork_user_types_preserved( + mock_binary_fit, + mock_binary_score, + mock_multi_fit, + mock_multi_score, + mock_regression_fit, + mock_regression_score, + problem_type, + X_y_binary, + X_y_multi, + X_y_regression, +): if problem_type == ProblemTypes.BINARY: X, y = X_y_binary mock_fit = mock_binary_fit mock_score = 
mock_binary_score - mock_score.return_value = {'Log Loss Binary': 1.0} + mock_score.return_value = {"Log Loss Binary": 1.0} elif problem_type == ProblemTypes.MULTICLASS: X, y = X_y_multi mock_fit = mock_multi_fit mock_score = mock_multi_score - mock_score.return_value = {'Log Loss Multiclass': 1.0} + mock_score.return_value = {"Log Loss Multiclass": 1.0} elif problem_type == ProblemTypes.REGRESSION: X, y = X_y_regression mock_fit = mock_regression_fit mock_score = mock_regression_score - mock_score.return_value = {'R2': 1.0} + mock_score.return_value = {"R2": 1.0} X = pd.DataFrame(X) new_col = np.zeros(len(X)) - new_col[:int(len(new_col) / 2)] = 1 - X['cat col'] = pd.Series(new_col) - X['num col'] = pd.Series(new_col) - X['text col'] = pd.Series([f"{num}" for num in range(len(new_col))]) - X.ww.init(semantic_tags={'cat col': 'category', 'num col': 'numeric'}, - logical_types={'cat col': 'Categorical', 'num col': 'Integer', 'text col': 'NaturalLanguage'}) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type, max_batches=5) + new_col[: int(len(new_col) / 2)] = 1 + X["cat col"] = pd.Series(new_col) + X["num col"] = pd.Series(new_col) + X["text col"] = pd.Series([f"{num}" for num in range(len(new_col))]) + X.ww.init( + semantic_tags={"cat col": "category", "num col": "numeric"}, + logical_types={ + "cat col": "Categorical", + "num col": "Integer", + "text col": "NaturalLanguage", + }, + ) + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type=problem_type, max_batches=5 + ) automl.search() for arg in mock_fit.call_args[0]: assert isinstance(arg, (pd.DataFrame, pd.Series)) if isinstance(arg, pd.DataFrame): - assert arg.ww.semantic_tags['cat col'] == {'category'} - assert arg.ww.logical_types['cat col'] == ww.logical_types.Categorical - assert arg.ww.semantic_tags['num col'] == {'numeric'} - assert arg.ww.logical_types['num col'] == ww.logical_types.Integer - assert arg.ww.semantic_tags['text col'] == set() - assert arg.ww.logical_types['text col'] == ww.logical_types.NaturalLanguage + assert arg.ww.semantic_tags["cat col"] == {"category"} + assert arg.ww.logical_types["cat col"] == ww.logical_types.Categorical + assert arg.ww.semantic_tags["num col"] == {"numeric"} + assert arg.ww.logical_types["num col"] == ww.logical_types.Integer + assert arg.ww.semantic_tags["text col"] == set() + assert arg.ww.logical_types["text col"] == ww.logical_types.NaturalLanguage for arg in mock_score.call_args[0]: assert isinstance(arg, (pd.DataFrame, pd.Series)) if isinstance(arg, pd.DataFrame): - assert arg.ww.semantic_tags['cat col'] == {'category'} - assert arg.ww.logical_types['cat col'] == ww.logical_types.Categorical - assert arg.ww.semantic_tags['num col'] == {'numeric'} - assert arg.ww.logical_types['num col'] == ww.logical_types.Integer - assert arg.ww.semantic_tags['text col'] == set() - assert arg.ww.logical_types['text col'] == ww.logical_types.NaturalLanguage + assert arg.ww.semantic_tags["cat col"] == {"category"} + assert arg.ww.logical_types["cat col"] == ww.logical_types.Categorical + assert arg.ww.semantic_tags["num col"] == {"numeric"} + assert arg.ww.logical_types["num col"] == ww.logical_types.Integer + assert arg.ww.semantic_tags["text col"] == set() + assert arg.ww.logical_types["text col"] == ww.logical_types.NaturalLanguage def test_automl_validates_problem_configuration(X_y_binary): X, y = X_y_binary - assert AutoMLSearch(X_train=X, y_train=y, problem_type="binary").problem_configuration == {} - assert AutoMLSearch(X_train=X, y_train=y, 
problem_type="multiclass").problem_configuration == {} - assert AutoMLSearch(X_train=X, y_train=y, problem_type="regression").problem_configuration == {} + assert ( + AutoMLSearch(X_train=X, y_train=y, problem_type="binary").problem_configuration + == {} + ) + assert ( + AutoMLSearch( + X_train=X, y_train=y, problem_type="multiclass" + ).problem_configuration + == {} + ) + assert ( + AutoMLSearch( + X_train=X, y_train=y, problem_type="regression" + ).problem_configuration + == {} + ) msg = "user_parameters must be a dict containing values for at least the date_index, gap, and max_delay parameters" with pytest.raises(ValueError, match=msg): AutoMLSearch(X_train=X, y_train=y, problem_type="time series regression") with pytest.raises(ValueError, match=msg): - AutoMLSearch(X_train=X, y_train=y, problem_type="time series regression", problem_configuration={"gap": 3}) + AutoMLSearch( + X_train=X, + y_train=y, + problem_type="time series regression", + problem_configuration={"gap": 3}, + ) with pytest.raises(ValueError, match=msg): - AutoMLSearch(X_train=X, y_train=y, problem_type="time series regression", problem_configuration={"max_delay": 2, "gap": 3}) - - problem_config = AutoMLSearch(X_train=X, y_train=y, problem_type="time series regression", - problem_configuration={"date_index": "Date", "max_delay": 2, "gap": 3}).problem_configuration + AutoMLSearch( + X_train=X, + y_train=y, + problem_type="time series regression", + problem_configuration={"max_delay": 2, "gap": 3}, + ) + + problem_config = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="time series regression", + problem_configuration={"date_index": "Date", "max_delay": 2, "gap": 3}, + ).problem_configuration assert problem_config == {"date_index": "Date", "max_delay": 2, "gap": 3} -@patch('evalml.objectives.BinaryClassificationObjective.optimize_threshold') +@patch("evalml.objectives.BinaryClassificationObjective.optimize_threshold") def test_automl_best_pipeline(mock_optimize, X_y_binary): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', train_best_pipeline=False, n_jobs=1) + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", train_best_pipeline=False, n_jobs=1 + ) automl.search() with pytest.raises(PipelineNotYetFittedError, match="not fitted"): automl.best_pipeline.predict(X) mock_optimize.return_value = 0.62 - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', optimize_thresholds=False, objective="Accuracy Binary", n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + optimize_thresholds=False, + objective="Accuracy Binary", + n_jobs=1, + ) automl.search() automl.best_pipeline.predict(X) assert automl.best_pipeline.threshold == 0.5 - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', optimize_thresholds=True, objective="Log Loss Binary", n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + optimize_thresholds=True, + objective="Log Loss Binary", + n_jobs=1, + ) automl.search() automl.best_pipeline.predict(X) assert automl.best_pipeline.threshold is None - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', optimize_thresholds=True, objective="Accuracy Binary", n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + optimize_thresholds=True, + objective="Accuracy Binary", + n_jobs=1, + ) automl.search() automl.best_pipeline.predict(X) assert automl.best_pipeline.threshold == 0.62 -@pytest.mark.parametrize("problem_type", 
[ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION]) -@patch('evalml.pipelines.RegressionPipeline.fit') -@patch('evalml.pipelines.RegressionPipeline.score') -@patch('evalml.pipelines.MulticlassClassificationPipeline.fit') -@patch('evalml.pipelines.MulticlassClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -def test_automl_data_splitter_consistent(mock_binary_score, mock_binary_fit, mock_multi_score, mock_multi_fit, - mock_regression_score, mock_regression_fit, problem_type, - X_y_binary, X_y_multi, X_y_regression): +@pytest.mark.parametrize( + "problem_type", + [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION], +) +@patch("evalml.pipelines.RegressionPipeline.fit") +@patch("evalml.pipelines.RegressionPipeline.score") +@patch("evalml.pipelines.MulticlassClassificationPipeline.fit") +@patch("evalml.pipelines.MulticlassClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +def test_automl_data_splitter_consistent( + mock_binary_score, + mock_binary_fit, + mock_multi_score, + mock_multi_fit, + mock_regression_score, + mock_regression_fit, + problem_type, + X_y_binary, + X_y_multi, + X_y_regression, +): if problem_type == ProblemTypes.BINARY: X, y = X_y_binary @@ -2178,47 +3339,81 @@ def test_automl_data_splitter_consistent(mock_binary_score, mock_binary_fit, moc data_splitters = [] random_seed = [0, 0, 1] for seed in random_seed: - a = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type, random_seed=seed, max_iterations=1) + a = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=problem_type, + random_seed=seed, + max_iterations=1, + ) a.search() - data_splitters.append([[set(train), set(test)] for train, test in a.data_splitter.split(X, y)]) + data_splitters.append( + [[set(train), set(test)] for train, test in a.data_splitter.split(X, y)] + ) # append split from last random state again, should be referencing same datasplit object - data_splitters.append([[set(train), set(test)] for train, test in a.data_splitter.split(X, y)]) + data_splitters.append( + [[set(train), set(test)] for train, test in a.data_splitter.split(X, y)] + ) assert data_splitters[0] == data_splitters[1] assert data_splitters[1] != data_splitters[2] assert data_splitters[2] == data_splitters[3] -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_automl_rerun(mock_fit, mock_score, X_y_binary, caplog): - mock_score.return_value = {'Log Loss Binary': 1.0} + mock_score.return_value = {"Log Loss Binary": 1.0} msg = "AutoMLSearch.search() has already been run and will not run again on the same instance" X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", train_best_pipeline=False, n_jobs=1) + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", train_best_pipeline=False, n_jobs=1 + ) automl.search() assert msg not in caplog.text automl.search() assert msg in caplog.text -@patch('evalml.pipelines.TimeSeriesRegressionPipeline.fit') -@patch('evalml.pipelines.TimeSeriesRegressionPipeline.score') -def test_timeseries_baseline_init_with_correct_gap_max_delay(mock_fit, mock_score, X_y_regression): 
+@patch("evalml.pipelines.TimeSeriesRegressionPipeline.fit") +@patch("evalml.pipelines.TimeSeriesRegressionPipeline.score") +def test_timeseries_baseline_init_with_correct_gap_max_delay( + mock_fit, mock_score, X_y_regression +): X, y = X_y_regression - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="time series regression", - problem_configuration={"date_index": None, "gap": 6, "max_delay": 3}, max_iterations=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="time series regression", + problem_configuration={"date_index": None, "gap": 6, "max_delay": 3}, + max_iterations=1, + ) automl.search() # Best pipeline is baseline pipeline because we only run one iteration - assert automl.best_pipeline.parameters == {"pipeline": {"date_index": None, "gap": 6, "max_delay": 3}, - "Time Series Baseline Estimator": {"date_index": None, "gap": 6, "max_delay": 3}} + assert automl.best_pipeline.parameters == { + "pipeline": {"date_index": None, "gap": 6, "max_delay": 3}, + "Time Series Baseline Estimator": { + "date_index": None, + "gap": 6, + "max_delay": 3, + }, + } -@pytest.mark.parametrize('problem_type', [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.REGRESSION]) -def test_automl_does_not_include_positive_only_objectives_by_default(problem_type, X_y_regression): +@pytest.mark.parametrize( + "problem_type", + [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.REGRESSION, + ], +) +def test_automl_does_not_include_positive_only_objectives_by_default( + problem_type, X_y_regression +): X, y = X_y_regression @@ -2228,216 +3423,386 @@ def test_automl_does_not_include_positive_only_objectives_by_default(problem_typ if objective_class.positive_only: only_positive.append(objective_class) - search = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type, - problem_configuration={"date_index": None, 'gap': 0, 'max_delay': 0}) + search = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=problem_type, + problem_configuration={"date_index": None, "gap": 0, "max_delay": 0}, + ) assert search.objective not in only_positive assert all([obj not in only_positive for obj in search.additional_objectives]) -@pytest.mark.parametrize('non_core_objective', get_non_core_objectives()) +@pytest.mark.parametrize("non_core_objective", get_non_core_objectives()) def test_automl_validate_objective(non_core_objective, X_y_regression): X, y = X_y_regression - with pytest.raises(ValueError, match='is not allowed in AutoML!'): - AutoMLSearch(X_train=X, y_train=y, problem_type=non_core_objective.problem_types[0], - objective=non_core_objective.name) + with pytest.raises(ValueError, match="is not allowed in AutoML!"): + AutoMLSearch( + X_train=X, + y_train=y, + problem_type=non_core_objective.problem_types[0], + objective=non_core_objective.name, + ) - with pytest.raises(ValueError, match='is not allowed in AutoML!'): - AutoMLSearch(X_train=X, y_train=y, problem_type=non_core_objective.problem_types[0], - additional_objectives=[non_core_objective.name]) + with pytest.raises(ValueError, match="is not allowed in AutoML!"): + AutoMLSearch( + X_train=X, + y_train=y, + problem_type=non_core_objective.problem_types[0], + additional_objectives=[non_core_objective.name], + ) -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +@patch("evalml.pipelines.BinaryClassificationPipeline.score") 
+@patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_automl_pipeline_params_simple(mock_fit, mock_score, X_y_binary): - mock_score.return_value = {'Log Loss Binary': 1.0} + mock_score.return_value = {"Log Loss Binary": 1.0} X, y = X_y_binary - params = {"Imputer": {"numeric_impute_strategy": "most_frequent"}, - "Logistic Regression Classifier": {"C": 10, - "penalty": 'l2'}, - "Elastic Net Classifier": {"alpha": 0.75, - "l1_ratio": 0.2}} - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", pipeline_parameters=params, n_jobs=1) + params = { + "Imputer": {"numeric_impute_strategy": "most_frequent"}, + "Logistic Regression Classifier": {"C": 10, "penalty": "l2"}, + "Elastic Net Classifier": {"alpha": 0.75, "l1_ratio": 0.2}, + } + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + pipeline_parameters=params, + n_jobs=1, + ) automl.search() for i, row in automl.rankings.iterrows(): - if 'Imputer' in row['parameters']: - assert row['parameters']['Imputer']['numeric_impute_strategy'] == 'most_frequent' - if 'Logistic Regression Classifier' in row['parameters']: - assert row['parameters']['Logistic Regression Classifier']['C'] == 10 - assert row['parameters']['Logistic Regression Classifier']['penalty'] == 'l2' - if 'Elastic Net Classifier' in row['parameters']: - assert row['parameters']['Elastic Net Classifier']['alpha'] == 0.75 - assert row['parameters']['Elastic Net Classifier']['l1_ratio'] == 0.2 - - -@patch('evalml.pipelines.RegressionPipeline.fit') -@patch('evalml.pipelines.RegressionPipeline.score') + if "Imputer" in row["parameters"]: + assert ( + row["parameters"]["Imputer"]["numeric_impute_strategy"] + == "most_frequent" + ) + if "Logistic Regression Classifier" in row["parameters"]: + assert row["parameters"]["Logistic Regression Classifier"]["C"] == 10 + assert ( + row["parameters"]["Logistic Regression Classifier"]["penalty"] == "l2" + ) + if "Elastic Net Classifier" in row["parameters"]: + assert row["parameters"]["Elastic Net Classifier"]["alpha"] == 0.75 + assert row["parameters"]["Elastic Net Classifier"]["l1_ratio"] == 0.2 + + +@patch("evalml.pipelines.RegressionPipeline.fit") +@patch("evalml.pipelines.RegressionPipeline.score") def test_automl_pipeline_params_multiple(mock_score, mock_fit, X_y_regression): - mock_score.return_value = {'R2': 1.0} + mock_score.return_value = {"R2": 1.0} X, y = X_y_regression - hyperparams = {'Imputer': {'numeric_impute_strategy': Categorical(['median', 'most_frequent'])}, - 'Decision Tree Regressor': {'max_depth': Categorical([17, 18, 19]), 'max_features': Categorical(['auto'])}, - 'Elastic Net Regressor': {"alpha": Real(0, 0.5), "l1_ratio": Categorical((0.01, 0.02, 0.03))}} - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', custom_hyperparameters=hyperparams, n_jobs=1) + hyperparams = { + "Imputer": { + "numeric_impute_strategy": Categorical(["median", "most_frequent"]) + }, + "Decision Tree Regressor": { + "max_depth": Categorical([17, 18, 19]), + "max_features": Categorical(["auto"]), + }, + "Elastic Net Regressor": { + "alpha": Real(0, 0.5), + "l1_ratio": Categorical((0.01, 0.02, 0.03)), + }, + } + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + custom_hyperparameters=hyperparams, + n_jobs=1, + ) automl.search() for i, row in automl.rankings.iterrows(): - if 'Imputer' in row['parameters']: - assert row['parameters']['Imputer']['numeric_impute_strategy'] == Categorical(['median', 'most_frequent']).rvs(random_state=automl.random_seed) 
- if 'Decision Tree Regressor' in row['parameters']: - assert row['parameters']['Decision Tree Regressor']['max_depth'] == Categorical([17, 18, 19]).rvs(random_state=automl.random_seed) - assert row['parameters']['Decision Tree Regressor']['max_features'] == 'auto' - if 'Elastic Net Regressor' in row['parameters']: - assert 0 < row['parameters']['Elastic Net Regressor']['alpha'] < 0.5 - assert row['parameters']['Elastic Net Regressor']['l1_ratio'] == Categorical((0.01, 0.02, 0.03)).rvs(random_state=automl.random_seed) + if "Imputer" in row["parameters"]: + assert row["parameters"]["Imputer"][ + "numeric_impute_strategy" + ] == Categorical(["median", "most_frequent"]).rvs( + random_state=automl.random_seed + ) + if "Decision Tree Regressor" in row["parameters"]: + assert row["parameters"]["Decision Tree Regressor"][ + "max_depth" + ] == Categorical([17, 18, 19]).rvs(random_state=automl.random_seed) + assert ( + row["parameters"]["Decision Tree Regressor"]["max_features"] == "auto" + ) + if "Elastic Net Regressor" in row["parameters"]: + assert 0 < row["parameters"]["Elastic Net Regressor"]["alpha"] < 0.5 + assert row["parameters"]["Elastic Net Regressor"][ + "l1_ratio" + ] == Categorical((0.01, 0.02, 0.03)).rvs(random_state=automl.random_seed) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.02}) -def test_automl_respects_pipeline_parameters_with_duplicate_components(mock_score, mock_fit, X_y_binary): +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.02}, +) +def test_automl_respects_pipeline_parameters_with_duplicate_components( + mock_score, mock_fit, X_y_binary +): X, y = X_y_binary # Pass the input of the first imputer to the second imputer - component_graph_dict = {"Imputer": ["Imputer"], - "Imputer_1": ["Imputer", "Imputer"], - "Random Forest Classifier": ["Random Forest Classifier", "Imputer_1"]} - pipeline_dict = BinaryClassificationPipeline(component_graph_dict, custom_name="Pipeline from dict") + component_graph_dict = { + "Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + "Random Forest Classifier": ["Random Forest Classifier", "Imputer_1"], + } + pipeline_dict = BinaryClassificationPipeline( + component_graph_dict, custom_name="Pipeline from dict" + ) component_graph_linear = ["Imputer", "Imputer", "Random Forest Classifier"] pipeline_linear = BinaryClassificationPipeline(component_graph_linear) - automl = AutoMLSearch(X, y, problem_type="binary", allowed_pipelines=[pipeline_dict, pipeline_linear], - pipeline_parameters={"Imputer": {"numeric_impute_strategy": "most_frequent"}, - "Imputer_1": {"numeric_impute_strategy": "median"}}, - max_batches=3) + automl = AutoMLSearch( + X, + y, + problem_type="binary", + allowed_pipelines=[pipeline_dict, pipeline_linear], + pipeline_parameters={ + "Imputer": {"numeric_impute_strategy": "most_frequent"}, + "Imputer_1": {"numeric_impute_strategy": "median"}, + }, + max_batches=3, + ) automl.search() for row in automl.full_rankings.iloc[1:3].parameters: assert row["Imputer"]["numeric_impute_strategy"] == "most_frequent" assert row["Imputer_1"]["numeric_impute_strategy"] == "median" - component_graph_dict = {"One Hot Encoder": ["One Hot Encoder"], - "One Hot Encoder_1": ["One Hot Encoder", "One Hot Encoder"], - "Random Forest Classifier": ["Random Forest Classifier", "One Hot Encoder_1"]} - pipeline_dict = 
BinaryClassificationPipeline(component_graph_dict, custom_name="Pipeline from dict") - - component_graph_linear = ["One Hot Encoder", "One Hot Encoder", "Random Forest Classifier"] + component_graph_dict = { + "One Hot Encoder": ["One Hot Encoder"], + "One Hot Encoder_1": ["One Hot Encoder", "One Hot Encoder"], + "Random Forest Classifier": ["Random Forest Classifier", "One Hot Encoder_1"], + } + pipeline_dict = BinaryClassificationPipeline( + component_graph_dict, custom_name="Pipeline from dict" + ) + + component_graph_linear = [ + "One Hot Encoder", + "One Hot Encoder", + "Random Forest Classifier", + ] pipeline_linear = BinaryClassificationPipeline(component_graph_linear) - automl = AutoMLSearch(X, y, problem_type="binary", allowed_pipelines=[pipeline_linear, pipeline_dict], - pipeline_parameters={"One Hot Encoder": {"top_n": 15}, - "One Hot Encoder_1": {"top_n": 25}}, - max_batches=3) + automl = AutoMLSearch( + X, + y, + problem_type="binary", + allowed_pipelines=[pipeline_linear, pipeline_dict], + pipeline_parameters={ + "One Hot Encoder": {"top_n": 15}, + "One Hot Encoder_1": {"top_n": 25}, + }, + max_batches=3, + ) automl.search() for row in automl.full_rankings.iloc[1:3].parameters: assert row["One Hot Encoder"]["top_n"] == 15 assert row["One Hot Encoder_1"]["top_n"] == 25 -@pytest.mark.parametrize('graph_type', ['linear', 'dict']) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.02}) -def test_automl_respects_pipeline_custom_hyperparameters_with_duplicate_components(mock_score, mock_fit, graph_type, X_y_binary): +@pytest.mark.parametrize("graph_type", ["linear", "dict"]) +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.02}, +) +def test_automl_respects_pipeline_custom_hyperparameters_with_duplicate_components( + mock_score, mock_fit, graph_type, X_y_binary +): X, y = X_y_binary - if graph_type == 'linear': - custom_hyperparameters = {"Imputer": {"numeric_impute_strategy": Categorical(["mean"])}, - "Imputer_1": {"numeric_impute_strategy": Categorical(["most_frequent", 'mean'])}, - "Random Forest Classifier": {"n_estimators": Categorical([100, 125])}} + if graph_type == "linear": + custom_hyperparameters = { + "Imputer": {"numeric_impute_strategy": Categorical(["mean"])}, + "Imputer_1": { + "numeric_impute_strategy": Categorical(["most_frequent", "mean"]) + }, + "Random Forest Classifier": {"n_estimators": Categorical([100, 125])}, + } component_graph = ["Imputer", "Imputer", "Random Forest Classifier"] pipeline_ = BinaryClassificationPipeline(component_graph) else: - custom_hyperparameters = {"Imputer": {"numeric_impute_strategy": Categorical(["most_frequent", 'mean'])}, - "Imputer_1": {"numeric_impute_strategy": Categorical(["median", 'mean'])}, - "Random Forest Classifier": {"n_estimators": Categorical([50, 100])}} - component_graph = {"Imputer": ["Imputer"], - "Imputer_1": ["Imputer", "Imputer"], - "Random Forest Classifier": ["Random Forest Classifier", "Imputer_1"]} - pipeline_ = BinaryClassificationPipeline(component_graph, custom_name="Pipeline from dict") - - automl = AutoMLSearch(X, y, problem_type="binary", allowed_pipelines=[pipeline_], custom_hyperparameters=custom_hyperparameters, max_batches=5) + custom_hyperparameters = { + "Imputer": { + "numeric_impute_strategy": Categorical(["most_frequent", "mean"]) + }, + "Imputer_1": 
{"numeric_impute_strategy": Categorical(["median", "mean"])}, + "Random Forest Classifier": {"n_estimators": Categorical([50, 100])}, + } + component_graph = { + "Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + "Random Forest Classifier": ["Random Forest Classifier", "Imputer_1"], + } + pipeline_ = BinaryClassificationPipeline( + component_graph, custom_name="Pipeline from dict" + ) + + automl = AutoMLSearch( + X, + y, + problem_type="binary", + allowed_pipelines=[pipeline_], + custom_hyperparameters=custom_hyperparameters, + max_batches=5, + ) automl.search() for i, row in automl.full_rankings.iterrows(): - if "Mode Baseline Binary" in row['pipeline_name']: + if "Mode Baseline Binary" in row["pipeline_name"]: continue if row["pipeline_name"] == "Pipeline Dict": - assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in {"most_frequent", "mean"} - assert row["parameters"]["Imputer_1"]["numeric_impute_strategy"] in {"median", "mean"} - assert row["parameters"]["Random Forest Classifier"]["n_estimators"] in {50, 100} + assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in { + "most_frequent", + "mean", + } + assert row["parameters"]["Imputer_1"]["numeric_impute_strategy"] in { + "median", + "mean", + } + assert row["parameters"]["Random Forest Classifier"]["n_estimators"] in { + 50, + 100, + } if row["pipeline_name"] == "Pipe Line Linear": assert row["parameters"]["Imputer"]["numeric_impute_strategy"] == "mean" - assert row["parameters"]["Imputer_1"]["numeric_impute_strategy"] in {"most_frequent", "mean"} - assert row["parameters"]["Random Forest Classifier"]["n_estimators"] in {100, 125} + assert row["parameters"]["Imputer_1"]["numeric_impute_strategy"] in { + "most_frequent", + "mean", + } + assert row["parameters"]["Random Forest Classifier"]["n_estimators"] in { + 100, + 125, + } -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.02}) -def test_automl_adds_pipeline_parameters_to_custom_pipeline_hyperparams(mock_score, mock_fit, X_y_binary): +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.02}, +) +def test_automl_adds_pipeline_parameters_to_custom_pipeline_hyperparams( + mock_score, mock_fit, X_y_binary +): X, y = X_y_binary - component_graph = {"Imputer": ["Imputer"], - "Imputer_1": ["Imputer", "Imputer"], - "One Hot Encoder": ["One Hot Encoder", "Imputer_1"], - "Random Forest Classifier": ["Random Forest Classifier", "One Hot Encoder"]} - pipeline_one = BinaryClassificationPipeline(component_graph, custom_name="Pipe Line One") - pipeline_two = BinaryClassificationPipeline(["Imputer", "Imputer", "One Hot Encoder", "Random Forest Classifier"], - custom_name="Pipe Line Two") - - pipeline_three = BinaryClassificationPipeline(["Imputer", "Imputer", "One Hot Encoder", "Random Forest Classifier"], - custom_name="Pipe Line Three") - - automl = AutoMLSearch(X, y, problem_type="binary", allowed_pipelines=[pipeline_one, pipeline_two, pipeline_three], - pipeline_parameters={"Imputer": {"numeric_impute_strategy": "most_frequent"}}, - custom_hyperparameters={"One Hot Encoder": {"top_n": Categorical([12, 10])}, - "Imputer": {"numeric_impute_strategy": Categorical(["median", "most_frequent"])}}, - max_batches=4) + component_graph = { + "Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + "One Hot Encoder": ["One Hot Encoder", 
"Imputer_1"], + "Random Forest Classifier": ["Random Forest Classifier", "One Hot Encoder"], + } + pipeline_one = BinaryClassificationPipeline( + component_graph, custom_name="Pipe Line One" + ) + pipeline_two = BinaryClassificationPipeline( + ["Imputer", "Imputer", "One Hot Encoder", "Random Forest Classifier"], + custom_name="Pipe Line Two", + ) + + pipeline_three = BinaryClassificationPipeline( + ["Imputer", "Imputer", "One Hot Encoder", "Random Forest Classifier"], + custom_name="Pipe Line Three", + ) + + automl = AutoMLSearch( + X, + y, + problem_type="binary", + allowed_pipelines=[pipeline_one, pipeline_two, pipeline_three], + pipeline_parameters={"Imputer": {"numeric_impute_strategy": "most_frequent"}}, + custom_hyperparameters={ + "One Hot Encoder": {"top_n": Categorical([12, 10])}, + "Imputer": { + "numeric_impute_strategy": Categorical(["median", "most_frequent"]) + }, + }, + max_batches=4, + ) automl.search() for i, row in automl.full_rankings.iterrows(): - if "Mode Baseline Binary" in row['pipeline_name']: + if "Mode Baseline Binary" in row["pipeline_name"]: continue - assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in ["most_frequent", "median"] - assert 10 <= row['parameters']["One Hot Encoder"]["top_n"] <= 12 + assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in [ + "most_frequent", + "median", + ] + assert 10 <= row["parameters"]["One Hot Encoder"]["top_n"] <= 12 -@patch('evalml.pipelines.MulticlassClassificationPipeline.score') -@patch('evalml.pipelines.MulticlassClassificationPipeline.fit') +@patch("evalml.pipelines.MulticlassClassificationPipeline.score") +@patch("evalml.pipelines.MulticlassClassificationPipeline.fit") def test_automl_pipeline_params_kwargs(mock_fit, mock_score, X_y_multi): - mock_score.return_value = {'Log Loss Multiclass': 1.0} + mock_score.return_value = {"Log Loss Multiclass": 1.0} X, y = X_y_multi - hyperparams = {'Imputer': {'numeric_impute_strategy': Categorical(['most_frequent'])}, - 'Decision Tree Classifier': {'max_depth': Integer(1, 2), 'ccp_alpha': Real(0.1, 0.5)}} - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', custom_hyperparameters=hyperparams, - allowed_model_families=[ModelFamily.DECISION_TREE], n_jobs=1) + hyperparams = { + "Imputer": {"numeric_impute_strategy": Categorical(["most_frequent"])}, + "Decision Tree Classifier": { + "max_depth": Integer(1, 2), + "ccp_alpha": Real(0.1, 0.5), + }, + } + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="multiclass", + custom_hyperparameters=hyperparams, + allowed_model_families=[ModelFamily.DECISION_TREE], + n_jobs=1, + ) automl.search() for i, row in automl.rankings.iterrows(): - if 'Imputer' in row['parameters']: - assert row['parameters']['Imputer']['numeric_impute_strategy'] == 'most_frequent' - if 'Decision Tree Classifier' in row['parameters']: - assert 0.1 < row['parameters']['Decision Tree Classifier']['ccp_alpha'] < 0.5 - assert row['parameters']['Decision Tree Classifier']['max_depth'] == 1 + if "Imputer" in row["parameters"]: + assert ( + row["parameters"]["Imputer"]["numeric_impute_strategy"] + == "most_frequent" + ) + if "Decision Tree Classifier" in row["parameters"]: + assert ( + 0.1 < row["parameters"]["Decision Tree Classifier"]["ccp_alpha"] < 0.5 + ) + assert row["parameters"]["Decision Tree Classifier"]["max_depth"] == 1 @pytest.mark.parametrize("random_seed", [0, 1, 9]) -@patch('evalml.pipelines.MulticlassClassificationPipeline.score') -@patch('evalml.pipelines.MulticlassClassificationPipeline.fit') 
+@patch("evalml.pipelines.MulticlassClassificationPipeline.score") +@patch("evalml.pipelines.MulticlassClassificationPipeline.fit") def test_automl_pipeline_random_seed(mock_fit, mock_score, random_seed, X_y_multi): - mock_score.return_value = {'Log Loss Multiclass': 1.0} + mock_score.return_value = {"Log Loss Multiclass": 1.0} X, y = X_y_multi - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', random_seed=random_seed, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="multiclass", + random_seed=random_seed, + n_jobs=1, + ) automl.search() for i, row in automl.rankings.iterrows(): - if 'Base' not in list(row['parameters'].keys())[0]: - assert automl.get_pipeline(row['id']).random_seed == random_seed + if "Base" not in list(row["parameters"].keys())[0]: + assert automl.get_pipeline(row["id"]).random_seed == random_seed def test_automl_check_for_high_variance(X_y_binary, dummy_binary_pipeline_class): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary') + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary") cv_scores = pd.Series([1, 1, 1]) pipeline = dummy_binary_pipeline_class(parameters={}) - assert not automl._check_for_high_variance(pipeline, cv_scores.mean(), cv_scores.std()) + assert not automl._check_for_high_variance( + pipeline, cv_scores.mean(), cv_scores.std() + ) cv_scores = pd.Series([0, 0, 0]) - assert not automl._check_for_high_variance(pipeline, cv_scores.mean(), cv_scores.std()) + assert not automl._check_for_high_variance( + pipeline, cv_scores.mean(), cv_scores.std() + ) cv_scores = pd.Series([0, 1, np.nan, np.nan]) assert automl._check_for_high_variance(pipeline, cv_scores.mean(), cv_scores.std()) @@ -2449,12 +3814,15 @@ def test_automl_check_for_high_variance(X_y_binary, dummy_binary_pipeline_class) assert automl._check_for_high_variance(pipeline, cv_scores.mean(), cv_scores.std()) -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_automl_check_high_variance_logs_warning(mock_fit_binary, X_y_binary, caplog): X, y = X_y_binary - with patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 1}): - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary') + with patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 1}, + ): + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary") automl.search() out = caplog.text assert "High coefficient of variation" not in out @@ -2462,8 +3830,13 @@ def test_automl_check_high_variance_logs_warning(mock_fit_binary, X_y_binary, ca caplog.clear() desired_score_values = [{"Log Loss Binary": i} for i in [1, 2, 10] * 2] - with patch('evalml.pipelines.BinaryClassificationPipeline.score', side_effect=desired_score_values): - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=2) + with patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + side_effect=desired_score_values, + ): + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", max_iterations=2 + ) automl.search() out = caplog.text assert "High coefficient of variation" in out @@ -2471,27 +3844,53 @@ def test_automl_check_high_variance_logs_warning(mock_fit_binary, X_y_binary, ca def test_automl_raises_error_with_duplicate_pipeline_names(X_y_binary): X, y = X_y_binary - pipeline_1 = BinaryClassificationPipeline(component_graph=["Imputer", "Random Forest 
Classifier"], custom_name="Custom Pipeline") - pipeline_2 = BinaryClassificationPipeline(component_graph=["Imputer", "Logistic Regression Classifier"], custom_name="Custom Pipeline") - pipeline_3 = BinaryClassificationPipeline(component_graph=["Logistic Regression Classifier"], custom_name="My Pipeline 3") - pipeline_4 = BinaryClassificationPipeline(component_graph=["Random Forest Classifier"], custom_name="My Pipeline 3") - - with pytest.raises(ValueError, - match="All pipeline names must be unique. The name 'Custom Pipeline' was repeated."): - AutoMLSearch(X, y, problem_type="binary", allowed_pipelines=[pipeline_1, pipeline_2, pipeline_3]) - - with pytest.raises(ValueError, - match="All pipeline names must be unique. The names 'Custom Pipeline', 'My Pipeline 3' were repeated."): - AutoMLSearch(X, y, problem_type="binary", allowed_pipelines=[pipeline_1, pipeline_2, pipeline_3, pipeline_4]) - - -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_train_batch_score_batch(mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary): - + pipeline_1 = BinaryClassificationPipeline( + component_graph=["Imputer", "Random Forest Classifier"], + custom_name="Custom Pipeline", + ) + pipeline_2 = BinaryClassificationPipeline( + component_graph=["Imputer", "Logistic Regression Classifier"], + custom_name="Custom Pipeline", + ) + pipeline_3 = BinaryClassificationPipeline( + component_graph=["Logistic Regression Classifier"], custom_name="My Pipeline 3" + ) + pipeline_4 = BinaryClassificationPipeline( + component_graph=["Random Forest Classifier"], custom_name="My Pipeline 3" + ) + + with pytest.raises( + ValueError, + match="All pipeline names must be unique. The name 'Custom Pipeline' was repeated.", + ): + AutoMLSearch( + X, + y, + problem_type="binary", + allowed_pipelines=[pipeline_1, pipeline_2, pipeline_3], + ) + + with pytest.raises( + ValueError, + match="All pipeline names must be unique. 
The names 'Custom Pipeline', 'My Pipeline 3' were repeated.", + ): + AutoMLSearch( + X, + y, + problem_type="binary", + allowed_pipelines=[pipeline_1, pipeline_2, pipeline_3, pipeline_4], + ) + + +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_train_batch_score_batch( + mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary +): def make_dummy_pipeline(index): class Pipeline(dummy_binary_pipeline_class): custom_name = f"Pipeline {index}" + return Pipeline({}) pipelines = [make_dummy_pipeline(i) for i in range(3)] @@ -2506,9 +3905,15 @@ class Pipeline(dummy_binary_pipeline_class): fitted_pipelines = automl.train_pipelines(pipelines) assert fitted_pipelines.keys() == {"Pipeline 0", "Pipeline 2"} - score_effects = [{"Log Loss Binary": 0.1}, {"Log Loss Binary": 0.2}, {"Log Loss Binary": 0.3}] + score_effects = [ + {"Log Loss Binary": 0.1}, + {"Log Loss Binary": 0.2}, + {"Log Loss Binary": 0.3}, + ] mock_score.side_effect = score_effects - expected_scores = {f"Pipeline {i}": effect for i, effect in zip(range(3), score_effects)} + expected_scores = { + f"Pipeline {i}": effect for i, effect in zip(range(3), score_effects) + } scores = automl.score_pipelines(pipelines, X, y, ["Log Loss Binary"]) assert scores == expected_scores @@ -2517,8 +3922,14 @@ def test_train_batch_returns_trained_pipelines(X_y_binary): X, y = X_y_binary automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary") - rf_pipeline = BinaryClassificationPipeline(["Random Forest Classifier"], parameters={"Random Forest Classifier": {"n_jobs": 1}}) - lrc_pipeline = BinaryClassificationPipeline(["Logistic Regression Classifier"], parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + rf_pipeline = BinaryClassificationPipeline( + ["Random Forest Classifier"], + parameters={"Random Forest Classifier": {"n_jobs": 1}}, + ) + lrc_pipeline = BinaryClassificationPipeline( + ["Logistic Regression Classifier"], + parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, + ) pipelines = [rf_pipeline, lrc_pipeline] fitted_pipelines = automl.train_pipelines(pipelines) @@ -2534,40 +3945,78 @@ def test_train_batch_returns_trained_pipelines(X_y_binary): assert fitted_pipeline.parameters == original_pipeline.parameters -@pytest.mark.parametrize("pipeline_fit_side_effect", - [[None] * 6, [None, Exception("foo"), None], - [None, Exception("bar"), Exception("baz")], - [Exception("Everything"), Exception("is"), Exception("broken")]]) -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.3}) -def test_train_batch_works(mock_score, pipeline_fit_side_effect, X_y_binary, - dummy_binary_pipeline_class, stackable_classifiers, caplog): - - exceptions_to_check = [str(e) for e in pipeline_fit_side_effect if isinstance(e, Exception)] +@pytest.mark.parametrize( + "pipeline_fit_side_effect", + [ + [None] * 6, + [None, Exception("foo"), None], + [None, Exception("bar"), Exception("baz")], + [Exception("Everything"), Exception("is"), Exception("broken")], + ], +) +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.3}, +) +def test_train_batch_works( + mock_score, + pipeline_fit_side_effect, + X_y_binary, + dummy_binary_pipeline_class, + stackable_classifiers, + caplog, +): + + exceptions_to_check = [ + str(e) for e in pipeline_fit_side_effect if isinstance(e, Exception) + ] X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', 
max_time=1, max_iterations=2, - train_best_pipeline=False, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_time=1, + max_iterations=2, + train_best_pipeline=False, + n_jobs=1, + ) def make_pipeline_name(index): class DummyPipeline(dummy_binary_pipeline_class): custom_name = f"Pipeline {index}" - return DummyPipeline({'Mock Classifier': {'a': index}}) - pipelines = [make_pipeline_name(i) for i in range(len(pipeline_fit_side_effect) - 1)] - input_pipelines = [BinaryClassificationPipeline([classifier]) for classifier in stackable_classifiers[:2]] - ensemble = BinaryClassificationPipeline([StackedEnsembleClassifier], - parameters={"Stacked Ensemble Classifier": {"input_pipelines": input_pipelines, "n_jobs": 1}}) + return DummyPipeline({"Mock Classifier": {"a": index}}) + + pipelines = [ + make_pipeline_name(i) for i in range(len(pipeline_fit_side_effect) - 1) + ] + input_pipelines = [ + BinaryClassificationPipeline([classifier]) + for classifier in stackable_classifiers[:2] + ] + ensemble = BinaryClassificationPipeline( + [StackedEnsembleClassifier], + parameters={ + "Stacked Ensemble Classifier": { + "input_pipelines": input_pipelines, + "n_jobs": 1, + } + }, + ) pipelines.append(ensemble) def train_batch_and_check(): caplog.clear() - with patch('evalml.pipelines.BinaryClassificationPipeline.fit') as mock_fit: + with patch("evalml.pipelines.BinaryClassificationPipeline.fit") as mock_fit: mock_fit.side_effect = pipeline_fit_side_effect trained_pipelines = automl.train_pipelines(pipelines) - assert len(trained_pipelines) == len(pipeline_fit_side_effect) - len(exceptions_to_check) + assert len(trained_pipelines) == len(pipeline_fit_side_effect) - len( + exceptions_to_check + ) assert mock_fit.call_count == len(pipeline_fit_side_effect) for exception in exceptions_to_check: assert exception in caplog.text @@ -2584,32 +4033,76 @@ def train_batch_and_check(): no_exception_scores = {"F1": 0.9, "AUC": 0.7, "Log Loss Binary": 0.25} -@pytest.mark.parametrize("pipeline_score_side_effect", - [[no_exception_scores] * 6, - [no_exception_scores, - PipelineScoreError(exceptions={"AUC": (Exception(), []), "Log Loss Binary": (Exception(), [])}, - scored_successfully={"F1": 0.2}), - no_exception_scores], - [no_exception_scores, - PipelineScoreError(exceptions={"AUC": (Exception(), []), "Log Loss Binary": (Exception(), [])}, - scored_successfully={"F1": 0.3}), - PipelineScoreError(exceptions={"AUC": (Exception(), []), "F1": (Exception(), [])}, - scored_successfully={"Log Loss Binary": 0.2})], - [PipelineScoreError(exceptions={"Log Loss Binary": (Exception(), []), "F1": (Exception(), [])}, - scored_successfully={"AUC": 0.6}), - PipelineScoreError(exceptions={"AUC": (Exception(), []), "Log Loss Binary": (Exception(), [])}, - scored_successfully={"F1": 0.2}), - PipelineScoreError(exceptions={"Log Loss Binary": (Exception(), [])}, - scored_successfully={"AUC": 0.2, "F1": 0.1})]]) -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -def test_score_batch_works(mock_score, pipeline_score_side_effect, X_y_binary, - dummy_binary_pipeline_class, stackable_classifiers, caplog): +@pytest.mark.parametrize( + "pipeline_score_side_effect", + [ + [no_exception_scores] * 6, + [ + no_exception_scores, + PipelineScoreError( + exceptions={ + "AUC": (Exception(), []), + "Log Loss Binary": (Exception(), []), + }, + scored_successfully={"F1": 0.2}, + ), + no_exception_scores, + ], + [ + no_exception_scores, + PipelineScoreError( + exceptions={ + "AUC": (Exception(), []), + "Log 
Loss Binary": (Exception(), []), + }, + scored_successfully={"F1": 0.3}, + ), + PipelineScoreError( + exceptions={"AUC": (Exception(), []), "F1": (Exception(), [])}, + scored_successfully={"Log Loss Binary": 0.2}, + ), + ], + [ + PipelineScoreError( + exceptions={ + "Log Loss Binary": (Exception(), []), + "F1": (Exception(), []), + }, + scored_successfully={"AUC": 0.6}, + ), + PipelineScoreError( + exceptions={ + "AUC": (Exception(), []), + "Log Loss Binary": (Exception(), []), + }, + scored_successfully={"F1": 0.2}, + ), + PipelineScoreError( + exceptions={"Log Loss Binary": (Exception(), [])}, + scored_successfully={"AUC": 0.2, "F1": 0.1}, + ), + ], + ], +) +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +def test_score_batch_works( + mock_score, + pipeline_score_side_effect, + X_y_binary, + dummy_binary_pipeline_class, + stackable_classifiers, + caplog, +): exceptions_to_check = [] expected_scores = {} for i, e in enumerate(pipeline_score_side_effect): # Ensemble pipeline has different name - pipeline_name = f"Pipeline {i}" if i < len(pipeline_score_side_effect) - 1 else "Templated Pipeline" + pipeline_name = ( + f"Pipeline {i}" + if i < len(pipeline_score_side_effect) - 1 + else "Templated Pipeline" + ) scores = no_exception_scores if isinstance(e, PipelineScoreError): scores = {"F1": np.nan, "AUC": np.nan, "Log Loss Binary": np.nan} @@ -2620,27 +4113,47 @@ def test_score_batch_works(mock_score, pipeline_score_side_effect, X_y_binary, X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=1, - allowed_pipelines=[dummy_binary_pipeline_class({})]) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=1, + allowed_pipelines=[dummy_binary_pipeline_class({})], + ) def make_pipeline_name(index): class DummyPipeline(dummy_binary_pipeline_class): custom_name = f"Pipeline {index}" - return DummyPipeline({'Mock Classifier': {'a': index}}) - pipelines = [make_pipeline_name(i) for i in range(len(pipeline_score_side_effect) - 1)] - input_pipelines = [BinaryClassificationPipeline([classifier]) for classifier in stackable_classifiers[:2]] - ensemble = BinaryClassificationPipeline([StackedEnsembleClassifier], - parameters={"Stacked Ensemble Classifier": {"input_pipelines": input_pipelines, "n_jobs": 1}}, - custom_name="Templated Pipeline") + return DummyPipeline({"Mock Classifier": {"a": index}}) + + pipelines = [ + make_pipeline_name(i) for i in range(len(pipeline_score_side_effect) - 1) + ] + input_pipelines = [ + BinaryClassificationPipeline([classifier]) + for classifier in stackable_classifiers[:2] + ] + ensemble = BinaryClassificationPipeline( + [StackedEnsembleClassifier], + parameters={ + "Stacked Ensemble Classifier": { + "input_pipelines": input_pipelines, + "n_jobs": 1, + } + }, + custom_name="Templated Pipeline", + ) pipelines.append(ensemble) def score_batch_and_check(): caplog.clear() - with patch('evalml.pipelines.BinaryClassificationPipeline.score') as mock_score: + with patch("evalml.pipelines.BinaryClassificationPipeline.score") as mock_score: mock_score.side_effect = pipeline_score_side_effect - scores = automl.score_pipelines(pipelines, X, y, objectives=["Log Loss Binary", "F1", "AUC"]) + scores = automl.score_pipelines( + pipelines, X, y, objectives=["Log Loss Binary", "F1", "AUC"] + ) assert scores == expected_scores for exception in exceptions_to_check: assert exception in caplog.text @@ -2654,8 +4167,9 @@ def score_batch_and_check(): score_batch_and_check() -def 
test_train_pipelines_score_pipelines_raise_exception_with_duplicate_names(X_y_binary, dummy_binary_pipeline_class): - +def test_train_pipelines_score_pipelines_raise_exception_with_duplicate_names( + X_y_binary, dummy_binary_pipeline_class +): class Pipeline1(dummy_binary_pipeline_class): custom_name = "My Pipeline" @@ -2664,56 +4178,101 @@ class Pipeline2(dummy_binary_pipeline_class): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=1, - allowed_pipelines=[dummy_binary_pipeline_class({})]) - - with pytest.raises(ValueError, match="All pipeline names must be unique. The name 'My Pipeline' was repeated."): + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=1, + allowed_pipelines=[dummy_binary_pipeline_class({})], + ) + + with pytest.raises( + ValueError, + match="All pipeline names must be unique. The name 'My Pipeline' was repeated.", + ): automl.train_pipelines([Pipeline2({}), Pipeline1({})]) - with pytest.raises(ValueError, match="All pipeline names must be unique. The name 'My Pipeline' was repeated."): + with pytest.raises( + ValueError, + match="All pipeline names must be unique. The name 'My Pipeline' was repeated.", + ): automl.score_pipelines([Pipeline2({}), Pipeline1({})], X, y, None) -def test_score_batch_before_fitting_yields_error_nan_scores(X_y_binary, dummy_binary_pipeline_class, caplog): +def test_score_batch_before_fitting_yields_error_nan_scores( + X_y_binary, dummy_binary_pipeline_class, caplog +): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=1, - allowed_pipelines=[dummy_binary_pipeline_class({})]) - - scored_pipelines = automl.score_pipelines([dummy_binary_pipeline_class({})], X, y, - objectives=["Log Loss Binary", F1()]) - assert scored_pipelines == {"Mock Binary Classification Pipeline": {"Log Loss Binary": np.nan, - "F1": np.nan}} + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=1, + allowed_pipelines=[dummy_binary_pipeline_class({})], + ) + + scored_pipelines = automl.score_pipelines( + [dummy_binary_pipeline_class({})], X, y, objectives=["Log Loss Binary", F1()] + ) + assert scored_pipelines == { + "Mock Binary Classification Pipeline": {"Log Loss Binary": np.nan, "F1": np.nan} + } assert "Score error for Mock Binary Classification Pipeline" in caplog.text assert "This LabelEncoder instance is not fitted yet." 
in caplog.text -def test_high_cv_check_no_warning_for_divide_by_zero(X_y_binary, dummy_binary_pipeline_class): +def test_high_cv_check_no_warning_for_divide_by_zero( + X_y_binary, dummy_binary_pipeline_class +): X, y = X_y_binary automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary") with pytest.warns(None) as warnings: - automl._check_for_high_variance(dummy_binary_pipeline_class({}), cv_mean=np.array([0.0]), - cv_std=np.array([0.1])) + automl._check_for_high_variance( + dummy_binary_pipeline_class({}), + cv_mean=np.array([0.0]), + cv_std=np.array([0.1]), + ) assert len(warnings) == 0 with pytest.warns(None) as warnings: # mean is 0 but std is not - automl._check_for_high_variance(dummy_binary_pipeline_class({}), - cv_mean=np.array([0.0, 1.0, -1.0]).mean(), cv_std=np.array([0.0, 1.0, -1.0]).std()) + automl._check_for_high_variance( + dummy_binary_pipeline_class({}), + cv_mean=np.array([0.0, 1.0, -1.0]).mean(), + cv_std=np.array([0.0, 1.0, -1.0]).std(), + ) assert len(warnings) == 0 -@pytest.mark.parametrize("automl_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION]) -@patch('evalml.pipelines.RegressionPipeline.score', return_value={"R2": 0.3}) -@patch('evalml.pipelines.ClassificationPipeline.score', return_value={"Log Loss Multiclass": 0.3}) -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.3}) -@patch('evalml.automl.engine.sequential_engine.train_pipeline') -def test_automl_supports_float_targets_for_classification(mock_train, mock_binary_score, mock_multi_score, mock_regression_score, - automl_type, X_y_binary, X_y_multi, X_y_regression, - dummy_binary_pipeline_class, - dummy_regression_pipeline_class, - dummy_multiclass_pipeline_class): +@pytest.mark.parametrize( + "automl_type", + [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION], +) +@patch("evalml.pipelines.RegressionPipeline.score", return_value={"R2": 0.3}) +@patch( + "evalml.pipelines.ClassificationPipeline.score", + return_value={"Log Loss Multiclass": 0.3}, +) +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.3}, +) +@patch("evalml.automl.engine.sequential_engine.train_pipeline") +def test_automl_supports_float_targets_for_classification( + mock_train, + mock_binary_score, + mock_multi_score, + mock_regression_score, + automl_type, + X_y_binary, + X_y_multi, + X_y_regression, + dummy_binary_pipeline_class, + dummy_regression_pipeline_class, + dummy_multiclass_pipeline_class, +): if automl_type == ProblemTypes.BINARY: X, y = X_y_binary y = pd.Series(y).map({0: -5.19, 1: 6.7}) @@ -2727,7 +4286,9 @@ def test_automl_supports_float_targets_for_classification(mock_train, mock_binar y = pd.Series(y) mock_train.return_value = dummy_regression_pipeline_class({}) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=automl_type, random_seed=0, n_jobs=1) + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type=automl_type, random_seed=0, n_jobs=1 + ) automl.search() # Assert that we train pipeline on the original target, not the encoded one used in EngineBase for data splitting @@ -2736,59 +4297,106 @@ def test_automl_supports_float_targets_for_classification(mock_train, mock_binar pd.testing.assert_series_equal(mock_y, y, check_dtype=False) -@pytest.mark.parametrize("problem_type", [ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.TIME_SERIES_BINARY, - ProblemTypes.TIME_SERIES_MULTICLASS]) +@pytest.mark.parametrize( + "problem_type", + [ + 
ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ], +) def test_automl_issues_beta_warning_for_time_series(problem_type, X_y_binary): X, y = X_y_binary with warnings.catch_warnings(record=True) as warn: warnings.simplefilter("always") - AutoMLSearch(X, y, problem_type=problem_type, problem_configuration={"date_index": None, "gap": 0, "max_delay": 2}) + AutoMLSearch( + X, + y, + problem_type=problem_type, + problem_configuration={"date_index": None, "gap": 0, "max_delay": 2}, + ) assert len(warn) == 1 message = "Time series support in evalml is still in beta, which means we are still actively building its core features" assert str(warn[0].message).startswith(message) -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.3}) -@patch('evalml.automl.engine.sequential_engine.train_pipeline') +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.3}, +) +@patch("evalml.automl.engine.sequential_engine.train_pipeline") def test_automl_drop_index_columns(mock_train, mock_binary_score, X_y_binary): X, y = X_y_binary X = pd.DataFrame(X) - X['index_col'] = pd.Series(range(len(X))) - X.ww.init(index='index_col') + X["index_col"] = pd.Series(range(len(X))) + X.ww.init(index="index_col") - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_batches=2) + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_batches=2) automl.search() for pipeline in automl.allowed_pipelines: print(pipeline.parameters) - assert pipeline.get_component('Drop Columns Transformer') - assert 'Drop Columns Transformer' in pipeline.parameters - assert pipeline.parameters['Drop Columns Transformer'] == {'columns': ['index_col']} + assert pipeline.get_component("Drop Columns Transformer") + assert "Drop Columns Transformer" in pipeline.parameters + assert pipeline.parameters["Drop Columns Transformer"] == { + "columns": ["index_col"] + } all_drop_column_params = [] for _, row in automl.full_rankings.iterrows(): if "Baseline" not in row.pipeline_name: - all_drop_column_params.append(row.parameters['Drop Columns Transformer']['columns']) - assert all(param == ['index_col'] for param in all_drop_column_params) + all_drop_column_params.append( + row.parameters["Drop Columns Transformer"]["columns"] + ) + assert all(param == ["index_col"] for param in all_drop_column_params) -def test_automl_validates_data_passed_in_to_allowed_pipelines(X_y_binary, dummy_binary_pipeline_class): +def test_automl_validates_data_passed_in_to_allowed_pipelines( + X_y_binary, dummy_binary_pipeline_class +): X, y = X_y_binary - with pytest.raises(ValueError, match="Parameter allowed_pipelines must be either None or a list!"): - AutoMLSearch(X, y, problem_type="binary", allowed_pipelines=dummy_binary_pipeline_class) - - with pytest.raises(ValueError, match="Every element of allowed_pipelines must an instance of PipelineBase!"): - AutoMLSearch(X, y, problem_type="binary", allowed_pipelines=[dummy_binary_pipeline_class]) - - with pytest.raises(ValueError, match="Every element of allowed_pipelines must an instance of PipelineBase!"): - AutoMLSearch(X, y, problem_type="binary", allowed_pipelines=[dummy_binary_pipeline_class.custom_name, dummy_binary_pipeline_class]) - - -@pytest.mark.parametrize("problem_type", [problem_type for problem_type in ProblemTypes.all_problem_types if not is_time_series(problem_type)]) + with pytest.raises( + ValueError, match="Parameter 
allowed_pipelines must be either None or a list!" + ): + AutoMLSearch( + X, y, problem_type="binary", allowed_pipelines=dummy_binary_pipeline_class + ) + + with pytest.raises( + ValueError, + match="Every element of allowed_pipelines must an instance of PipelineBase!", + ): + AutoMLSearch( + X, y, problem_type="binary", allowed_pipelines=[dummy_binary_pipeline_class] + ) + + with pytest.raises( + ValueError, + match="Every element of allowed_pipelines must an instance of PipelineBase!", + ): + AutoMLSearch( + X, + y, + problem_type="binary", + allowed_pipelines=[ + dummy_binary_pipeline_class.custom_name, + dummy_binary_pipeline_class, + ], + ) + + +@pytest.mark.parametrize( + "problem_type", + [ + problem_type + for problem_type in ProblemTypes.all_problem_types + if not is_time_series(problem_type) + ], +) def test_automl_baseline_pipeline_predictions_and_scores(problem_type): - X = pd.DataFrame({'one': [1, 2, 3, 4], 'two': [2, 3, 4, 5], 'three': [1, 2, 3, 4]}) + X = pd.DataFrame({"one": [1, 2, 3, 4], "two": [2, 3, 4, 5], "three": [1, 2, 3, 4]}) y = pd.Series([10, 11, 10, 10]) if problem_type == ProblemTypes.MULTICLASS: y = pd.Series([10, 11, 12, 11]) @@ -2798,39 +4406,69 @@ def test_automl_baseline_pipeline_predictions_and_scores(problem_type): if problem_type == ProblemTypes.BINARY: expected_predictions = pd.Series(np.array([10] * len(X)), dtype="int64") - expected_predictions_proba = pd.DataFrame({10: [1., 1., 1., 1.], 11: [0., 0., 0., 0.]}) + expected_predictions_proba = pd.DataFrame( + {10: [1.0, 1.0, 1.0, 1.0], 11: [0.0, 0.0, 0.0, 0.0]} + ) if problem_type == ProblemTypes.MULTICLASS: expected_predictions = pd.Series(np.array([11] * len(X)), dtype="int64") - expected_predictions_proba = pd.DataFrame({10: [0., 0., 0., 0.], 11: [1., 1., 1., 1.], 12: [0., 0., 0., 0.]}) + expected_predictions_proba = pd.DataFrame( + { + 10: [0.0, 0.0, 0.0, 0.0], + 11: [1.0, 1.0, 1.0, 1.0], + 12: [0.0, 0.0, 0.0, 0.0], + } + ) if problem_type == ProblemTypes.REGRESSION: mean = y.mean() expected_predictions = pd.Series([mean] * len(X)) pd.testing.assert_series_equal(expected_predictions, baseline.predict(X)) if is_classification(problem_type): - pd.testing.assert_frame_equal(expected_predictions_proba, baseline.predict_proba(X)) - np.testing.assert_allclose(baseline.feature_importance.iloc[:, 1], np.array([0.0] * X.shape[1])) - - -@pytest.mark.parametrize('gap', [0, 1]) -@pytest.mark.parametrize("problem_type", [problem_type for problem_type in ProblemTypes.all_problem_types if is_time_series(problem_type)]) + pd.testing.assert_frame_equal( + expected_predictions_proba, baseline.predict_proba(X) + ) + np.testing.assert_allclose( + baseline.feature_importance.iloc[:, 1], np.array([0.0] * X.shape[1]) + ) + + +@pytest.mark.parametrize("gap", [0, 1]) +@pytest.mark.parametrize( + "problem_type", + [ + problem_type + for problem_type in ProblemTypes.all_problem_types + if is_time_series(problem_type) + ], +) def test_automl_baseline_pipeline_predictions_and_scores_time_series(problem_type, gap): X = pd.DataFrame({"a": [4, 5, 6, 7, 8]}) y = pd.Series([0, 1, 1, 0, 1]) - expected_predictions_proba = pd.DataFrame({0: pd.Series([1, 0, 0, 1, 0], dtype="float64"), - 1: pd.Series([0, 1, 1, 0, 1], dtype="float64")}) + expected_predictions_proba = pd.DataFrame( + { + 0: pd.Series([1, 0, 0, 1, 0], dtype="float64"), + 1: pd.Series([0, 1, 1, 0, 1], dtype="float64"), + } + ) if problem_type == ProblemTypes.TIME_SERIES_MULTICLASS: y = pd.Series([0, 1, 2, 2, 1]) - expected_predictions_proba = pd.DataFrame({0: pd.Series([1, 0, 
0, 0, 0], dtype="float64"), - 1: pd.Series([0, 1, 0, 0, 1], dtype="float64"), - 2: pd.Series([0, 0, 1, 1, 0], dtype="float64")}) + expected_predictions_proba = pd.DataFrame( + { + 0: pd.Series([1, 0, 0, 0, 0], dtype="float64"), + 1: pd.Series([0, 1, 0, 0, 1], dtype="float64"), + 2: pd.Series([0, 0, 1, 1, 0], dtype="float64"), + } + ) if gap == 0: # Shift to pad the first row with Nans expected_predictions_proba = expected_predictions_proba.shift(1) - automl = AutoMLSearch(X, y, - problem_type=problem_type, - problem_configuration={"date_index": None, "gap": gap, "max_delay": 1}) + automl = AutoMLSearch( + X, + y, + problem_type=problem_type, + problem_configuration={"date_index": None, "gap": gap, "max_delay": 1}, + ) baseline = automl._get_baseline_pipeline() baseline.fit(X, y) @@ -2841,5 +4479,9 @@ def test_automl_baseline_pipeline_predictions_and_scores_time_series(problem_typ pd.testing.assert_series_equal(expected_predictions, baseline.predict(X, y)) if is_classification(problem_type): - pd.testing.assert_frame_equal(expected_predictions_proba, baseline.predict_proba(X, y)) - np.testing.assert_allclose(baseline.feature_importance.iloc[:, 1], np.array([0.0] * X.shape[1])) + pd.testing.assert_frame_equal( + expected_predictions_proba, baseline.predict_proba(X, y) + ) + np.testing.assert_allclose( + baseline.feature_importance.iloc[:, 1], np.array([0.0] * X.shape[1]) + ) diff --git a/evalml/tests/automl_tests/test_automl_algorithm.py b/evalml/tests/automl_tests/test_automl_algorithm.py index 8308c67499..46117c7f26 100644 --- a/evalml/tests/automl_tests/test_automl_algorithm.py +++ b/evalml/tests/automl_tests/test_automl_algorithm.py @@ -14,7 +14,7 @@ def next_batch(self): self._batch_number += 1 if len(self._dummy_pipelines) > 0: return self._dummy_pipelines.pop() - raise StopIteration('No more pipelines!') + raise StopIteration("No more pipelines!") def test_automl_algorithm_dummy(): @@ -22,24 +22,27 @@ def test_automl_algorithm_dummy(): assert algo.pipeline_number == 0 assert algo.batch_number == 0 - algo = DummyAlgorithm(dummy_pipelines=['pipeline 3', 'pipeline 2', 'pipeline 1']) + algo = DummyAlgorithm(dummy_pipelines=["pipeline 3", "pipeline 2", "pipeline 1"]) assert algo.pipeline_number == 0 assert algo.batch_number == 0 - assert algo.next_batch() == 'pipeline 1' + assert algo.next_batch() == "pipeline 1" assert algo.pipeline_number == 1 assert algo.batch_number == 1 - assert algo.next_batch() == 'pipeline 2' + assert algo.next_batch() == "pipeline 2" assert algo.pipeline_number == 2 assert algo.batch_number == 2 - assert algo.next_batch() == 'pipeline 3' + assert algo.next_batch() == "pipeline 3" assert algo.pipeline_number == 3 assert algo.batch_number == 3 - with pytest.raises(StopIteration, match='No more pipelines!'): + with pytest.raises(StopIteration, match="No more pipelines!"): algo.next_batch() def test_automl_algorithm_invalid_pipeline_add(dummy_regression_pipeline_class): algo = DummyAlgorithm() pipeline = dummy_regression_pipeline_class(parameters={}) - with pytest.raises(PipelineNotFoundError, match="No such pipeline allowed in this AutoML search: Mock Regression Pipeline"): + with pytest.raises( + PipelineNotFoundError, + match="No such pipeline allowed in this AutoML search: Mock Regression Pipeline", + ): algo.add_result(0.1234, pipeline, {}) diff --git a/evalml/tests/automl_tests/test_automl_dask.py b/evalml/tests/automl_tests/test_automl_dask.py index ab43d9d5f7..74f49c213c 100644 --- a/evalml/tests/automl_tests/test_automl_dask.py +++ 
b/evalml/tests/automl_tests/test_automl_dask.py @@ -11,16 +11,15 @@ TestPipelineFast, TestPipelineSlow, TestPipelineWithFitError, - TestPipelineWithScoreError + TestPipelineWithScoreError, ) @pytest.mark.usefixtures("X_y_binary_cls") class TestAutoMLSearchDask(unittest.TestCase): - @pytest.fixture(autouse=True) def inject_fixtures(self, caplog): - """ Gives the unittests access to the logger""" + """Gives the unittests access to the logger""" self._caplog = caplog @classmethod @@ -30,13 +29,17 @@ def setUpClass(cls) -> None: cls.sequential_engine = SequentialEngine() def test_automl(self): - """ Comparing the results of parallel and sequential AutoML to each other.""" + """Comparing the results of parallel and sequential AutoML to each other.""" X, y = self.X_y_binary - par_automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", engine=self.parallel_engine) + par_automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", engine=self.parallel_engine + ) par_automl.search() parallel_rankings = par_automl.full_rankings - seq_automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", engine=self.sequential_engine) + seq_automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", engine=self.sequential_engine + ) seq_automl.search() sequential_rankings = seq_automl.full_rankings @@ -44,54 +47,98 @@ def test_automl(self): seq_results = sequential_rankings.drop(columns=["id"]) assert all(seq_results["pipeline_name"] == par_results["pipeline_name"]) - assert np.allclose(np.array(seq_results["mean_cv_score"]), np.array(par_results["mean_cv_score"])) - assert np.allclose(np.array(seq_results["validation_score"]), np.array(par_results["validation_score"])) - assert np.allclose(np.array(seq_results["percent_better_than_baseline"]), np.array(par_results["percent_better_than_baseline"])) + assert np.allclose( + np.array(seq_results["mean_cv_score"]), + np.array(par_results["mean_cv_score"]), + ) + assert np.allclose( + np.array(seq_results["validation_score"]), + np.array(par_results["validation_score"]), + ) + assert np.allclose( + np.array(seq_results["percent_better_than_baseline"]), + np.array(par_results["percent_better_than_baseline"]), + ) def test_automl_max_iterations(self): - """ Making sure that the max_iterations parameter limits the number of pipelines run. """ + """Making sure that the max_iterations parameter limits the number of pipelines run.""" X, y = self.X_y_binary max_iterations = 4 - par_automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", engine=self.parallel_engine, - max_iterations=max_iterations) + par_automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + engine=self.parallel_engine, + max_iterations=max_iterations, + ) par_automl.search() parallel_rankings = par_automl.full_rankings - seq_automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", engine=self.sequential_engine, - max_iterations=max_iterations) + seq_automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + engine=self.sequential_engine, + max_iterations=max_iterations, + ) seq_automl.search() sequential_rankings = seq_automl.full_rankings assert len(sequential_rankings) == len(parallel_rankings) == max_iterations def test_automl_train_dask_error_callback(self): - """ Make sure the pipeline training error message makes its way back from the workers. 
""" + """Make sure the pipeline training error message makes its way back from the workers.""" self._caplog.clear() X, y = self.X_y_binary pipelines = [TestPipelineWithFitError({})] - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", engine=self.parallel_engine, - max_iterations=2, allowed_pipelines=pipelines) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + engine=self.parallel_engine, + max_iterations=2, + allowed_pipelines=pipelines, + ) automl.train_pipelines(pipelines) assert "Train error for PipelineWithError: Yikes" in self._caplog.text def test_automl_score_dask_error_callback(self): - """ Make sure the pipeline scoring error message makes its way back from the workers. """ + """Make sure the pipeline scoring error message makes its way back from the workers.""" self._caplog.clear() X, y = self.X_y_binary pipelines = [TestPipelineWithScoreError({})] - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", engine=self.parallel_engine, - max_iterations=2, allowed_pipelines=pipelines) - automl.score_pipelines(pipelines, X, y, objectives=["Log Loss Binary", "F1", "AUC"]) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + engine=self.parallel_engine, + max_iterations=2, + allowed_pipelines=pipelines, + ) + automl.score_pipelines( + pipelines, X, y, objectives=["Log Loss Binary", "F1", "AUC"] + ) assert "Score error for PipelineWithError" in self._caplog.text def test_automl_immediate_quit(self): - """ Make sure the AutoMLSearch quits when error_callback is defined and does no further work. """ + """Make sure the AutoMLSearch quits when error_callback is defined and does no further work.""" self._caplog.clear() X, y = self.X_y_binary - pipelines = [TestPipelineFast({}), TestPipelineWithFitError({}), TestPipelineSlow({})] - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", engine=self.parallel_engine, - max_iterations=4, allowed_pipelines=pipelines, error_callback=raise_error_callback, - optimize_thresholds=False) + pipelines = [ + TestPipelineFast({}), + TestPipelineWithFitError({}), + TestPipelineSlow({}), + ] + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + engine=self.parallel_engine, + max_iterations=4, + allowed_pipelines=pipelines, + error_callback=raise_error_callback, + optimize_thresholds=False, + ) # Ensure the broken pipeline raises the error with pytest.raises(Exception, match="Yikes"): @@ -99,9 +146,15 @@ def test_automl_immediate_quit(self): # Make sure the automl algorithm stopped after the broken pipeline raised assert len(automl.full_rankings) < len(pipelines) - assert TestPipelineFast.custom_name in set(automl.full_rankings["pipeline_name"]) - assert TestPipelineSlow.custom_name not in set(automl.full_rankings["pipeline_name"]) - assert TestPipelineWithFitError.custom_name not in set(automl.full_rankings["pipeline_name"]) + assert TestPipelineFast.custom_name in set( + automl.full_rankings["pipeline_name"] + ) + assert TestPipelineSlow.custom_name not in set( + automl.full_rankings["pipeline_name"] + ) + assert TestPipelineWithFitError.custom_name not in set( + automl.full_rankings["pipeline_name"] + ) @classmethod def tearDownClass(cls) -> None: diff --git a/evalml/tests/automl_tests/test_automl_search_classification.py b/evalml/tests/automl_tests/test_automl_search_classification.py index 4f44ba2de4..a1f67067e9 100644 --- a/evalml/tests/automl_tests/test_automl_search_classification.py +++ 
b/evalml/tests/automl_tests/test_automl_search_classification.py @@ -15,14 +15,14 @@ Precision, PrecisionMicro, Recall, - get_objective + get_objective, ) from evalml.pipelines import ( BinaryClassificationPipeline, MulticlassClassificationPipeline, PipelineBase, TimeSeriesBinaryClassificationPipeline, - TimeSeriesMulticlassClassificationPipeline + TimeSeriesMulticlassClassificationPipeline, ) from evalml.pipelines.components.utils import get_estimators from evalml.pipelines.utils import make_pipeline @@ -33,7 +33,9 @@ def test_init(X_y_binary): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=1, n_jobs=1) + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", max_iterations=1, n_jobs=1 + ) automl.search() assert automl.n_jobs == 1 @@ -42,28 +44,42 @@ def test_init(X_y_binary): automl.best_pipeline.predict(X) # test with dataframes - automl = AutoMLSearch(pd.DataFrame(X), pd.Series(y), problem_type='binary', max_iterations=1, n_jobs=1) + automl = AutoMLSearch( + pd.DataFrame(X), pd.Series(y), problem_type="binary", max_iterations=1, n_jobs=1 + ) automl.search() assert isinstance(automl.rankings, pd.DataFrame) assert isinstance(automl.full_rankings, pd.DataFrame) assert isinstance(automl.best_pipeline, PipelineBase) assert isinstance(automl.get_pipeline(0), PipelineBase) - assert automl.objective.name == 'Log Loss Binary' + assert automl.objective.name == "Log Loss Binary" automl.best_pipeline.predict(X) def test_init_objective(X_y_binary): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective=Precision(), max_iterations=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective=Precision(), + max_iterations=1, + ) assert isinstance(automl.objective, Precision) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='Precision', max_iterations=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="Precision", + max_iterations=1, + ) assert isinstance(automl.objective, Precision) def test_get_pipeline_none(X_y_binary): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary') + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary") with pytest.raises(PipelineNotFoundError, match="Pipeline not found"): automl.describe_pipeline(0) @@ -71,25 +87,43 @@ def test_get_pipeline_none(X_y_binary): def test_data_splitter(X_y_binary): X, y = X_y_binary cv_folds = 5 - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', data_splitter=StratifiedKFold(n_splits=cv_folds), max_iterations=1, - n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + data_splitter=StratifiedKFold(n_splits=cv_folds), + max_iterations=1, + n_jobs=1, + ) automl.search() assert isinstance(automl.rankings, pd.DataFrame) - assert len(automl.results['pipeline_results'][0]["cv_data"]) == cv_folds - - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', data_splitter=TimeSeriesSplit(n_splits=cv_folds), - max_iterations=1, n_jobs=1) + assert len(automl.results["pipeline_results"][0]["cv_data"]) == cv_folds + + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + data_splitter=TimeSeriesSplit(n_splits=cv_folds), + max_iterations=1, + n_jobs=1, + ) automl.search() assert isinstance(automl.rankings, pd.DataFrame) - assert len(automl.results['pipeline_results'][0]["cv_data"]) == cv_folds + assert 
len(automl.results["pipeline_results"][0]["cv_data"]) == cv_folds def test_max_iterations(X_y_binary): X, y = X_y_binary max_iterations = 5 - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=max_iterations, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=max_iterations, + n_jobs=1, + ) automl.search() assert len(automl.full_rankings) == max_iterations @@ -97,22 +131,42 @@ def test_max_iterations(X_y_binary): def test_recall_error(X_y_binary): X, y = X_y_binary # Recall is a valid objective but it's not allowed in AutoML so a ValueError is expected - error_msg = 'recall is not allowed in AutoML!' + error_msg = "recall is not allowed in AutoML!" with pytest.raises(ValueError, match=error_msg): - AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='recall', max_iterations=1) + AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="recall", + max_iterations=1, + ) def test_recall_object(X_y_binary): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective=Recall(), max_iterations=1, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective=Recall(), + max_iterations=1, + n_jobs=1, + ) automl.search() assert len(automl.full_rankings) > 0 - assert automl.objective.name == 'Recall' + assert automl.objective.name == "Recall" def test_binary_auto(X_y_binary): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective="Log Loss Binary", max_iterations=5, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="Log Loss Binary", + max_iterations=5, + n_jobs=1, + ) automl.search() best_pipeline = automl.best_pipeline @@ -124,44 +178,68 @@ def test_binary_auto(X_y_binary): def test_multi_auto(X_y_multi, multiclass_core_objectives): X, y = X_y_multi objective = PrecisionMicro() - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', objective=objective, max_iterations=5, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="multiclass", + objective=objective, + max_iterations=5, + n_jobs=1, + ) automl.search() best_pipeline = automl.best_pipeline assert best_pipeline._is_fitted y_pred = best_pipeline.predict(X) assert len(np.unique(y_pred)) == 3 - objective_in_additional_objectives = next((obj for obj in multiclass_core_objectives if obj.name == objective.name), None) + objective_in_additional_objectives = next( + (obj for obj in multiclass_core_objectives if obj.name == objective.name), None + ) multiclass_core_objectives.remove(objective_in_additional_objectives) - for expected, additional in zip(multiclass_core_objectives, automl.additional_objectives): + for expected, additional in zip( + multiclass_core_objectives, automl.additional_objectives + ): assert type(additional) is type(expected) def test_multi_objective(X_y_multi): X, y = X_y_multi - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective="Log Loss Binary") + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", objective="Log Loss Binary" + ) assert automl.problem_type == ProblemTypes.BINARY - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', objective="Log Loss Multiclass") + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="multiclass", objective="Log Loss Multiclass" + ) assert automl.problem_type == ProblemTypes.MULTICLASS - automl = 
AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', objective='AUC Micro') + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="multiclass", objective="AUC Micro" + ) assert automl.problem_type == ProblemTypes.MULTICLASS - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='AUC') + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", objective="AUC") assert automl.problem_type == ProblemTypes.BINARY - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass') + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="multiclass") assert automl.problem_type == ProblemTypes.MULTICLASS - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary') + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary") assert automl.problem_type == ProblemTypes.BINARY def test_categorical_classification(X_y_categorical_classification): X, y = X_y_categorical_classification - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective="precision", max_iterations=5, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="precision", + max_iterations=5, + n_jobs=1, + ) automl.search() assert not automl.rankings["mean_cv_score"].isnull().all() @@ -169,10 +247,26 @@ def test_categorical_classification(X_y_categorical_classification): def test_random_seed(X_y_binary): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective=Precision(), max_iterations=5, random_seed=0, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective=Precision(), + max_iterations=5, + random_seed=0, + n_jobs=1, + ) automl.search() - automl_1 = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective=Precision(), max_iterations=5, random_seed=0, n_jobs=1) + automl_1 = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective=Precision(), + max_iterations=5, + random_seed=0, + n_jobs=1, + ) automl_1.search() assert automl.rankings.equals(automl_1.rankings) @@ -192,65 +286,130 @@ def add_result_callback(results, trained_pipeline, automl_obj, counts=counts): counts["add_result_callback"] += 1 max_iterations = 3 - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective=Precision(), max_iterations=max_iterations, - start_iteration_callback=start_iteration_callback, - add_result_callback=add_result_callback, - n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective=Precision(), + max_iterations=max_iterations, + start_iteration_callback=start_iteration_callback, + add_result_callback=add_result_callback, + n_jobs=1, + ) automl.search() - assert counts["start_iteration_callback"] == len(get_estimators('binary')) + 1 + assert counts["start_iteration_callback"] == len(get_estimators("binary")) + 1 assert counts["add_result_callback"] == max_iterations def test_additional_objectives(X_y_binary): X, y = X_y_binary - objective = FraudCost(retry_percentage=.5, - interchange_fee=.02, - fraud_payout_percentage=.75, - amount_col=10) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='F1', max_iterations=2, additional_objectives=[objective], - n_jobs=1) + objective = FraudCost( + retry_percentage=0.5, + interchange_fee=0.02, + fraud_payout_percentage=0.75, + amount_col=10, + ) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="F1", + max_iterations=2, + 
additional_objectives=[objective], + n_jobs=1, + ) automl.search() results = automl.describe_pipeline(0, return_dict=True) - assert 'Fraud Cost' in list(results["cv_data"][0]["all_objective_scores"].keys()) + assert "Fraud Cost" in list(results["cv_data"][0]["all_objective_scores"].keys()) -@patch('evalml.objectives.BinaryClassificationObjective.optimize_threshold') -@patch('evalml.pipelines.BinaryClassificationPipeline._encode_targets', side_effect=lambda y: y) -@patch('evalml.pipelines.BinaryClassificationPipeline.predict_proba') -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_optimizable_threshold_enabled(mock_fit, mock_score, mock_predict_proba, mock_encode_targets, mock_optimize_threshold, X_y_binary, caplog): +@patch("evalml.objectives.BinaryClassificationObjective.optimize_threshold") +@patch( + "evalml.pipelines.BinaryClassificationPipeline._encode_targets", + side_effect=lambda y: y, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba") +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_optimizable_threshold_enabled( + mock_fit, + mock_score, + mock_predict_proba, + mock_encode_targets, + mock_optimize_threshold, + X_y_binary, + caplog, +): mock_optimize_threshold.return_value = 0.8 X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='precision', max_iterations=1, optimize_thresholds=True) - mock_score.return_value = {'precision': 1.0} + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="precision", + max_iterations=1, + optimize_thresholds=True, + ) + mock_score.return_value = {"precision": 1.0} automl.search() mock_fit.assert_called() mock_score.assert_called() mock_predict_proba.assert_called() mock_optimize_threshold.assert_called() assert automl.best_pipeline.threshold == 0.8 - assert automl.results['pipeline_results'][0]['cv_data'][0].get('binary_classification_threshold') == 0.8 - assert automl.results['pipeline_results'][0]['cv_data'][1].get('binary_classification_threshold') == 0.8 - assert automl.results['pipeline_results'][0]['cv_data'][2].get('binary_classification_threshold') == 0.8 + assert ( + automl.results["pipeline_results"][0]["cv_data"][0].get( + "binary_classification_threshold" + ) + == 0.8 + ) + assert ( + automl.results["pipeline_results"][0]["cv_data"][1].get( + "binary_classification_threshold" + ) + == 0.8 + ) + assert ( + automl.results["pipeline_results"][0]["cv_data"][2].get( + "binary_classification_threshold" + ) + == 0.8 + ) automl.describe_pipeline(0) out = caplog.text assert "Objective to optimize binary classification pipeline thresholds for" in out -@patch('evalml.objectives.BinaryClassificationObjective.optimize_threshold') -@patch('evalml.pipelines.BinaryClassificationPipeline._encode_targets', side_effect=lambda y: y) -@patch('evalml.pipelines.BinaryClassificationPipeline.predict_proba') -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_optimizable_threshold_disabled(mock_fit, mock_score, mock_predict_proba, mock_encode_targets, mock_optimize_threshold, X_y_binary): +@patch("evalml.objectives.BinaryClassificationObjective.optimize_threshold") +@patch( + "evalml.pipelines.BinaryClassificationPipeline._encode_targets", + side_effect=lambda y: y, +) 
+@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba") +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_optimizable_threshold_disabled( + mock_fit, + mock_score, + mock_predict_proba, + mock_encode_targets, + mock_optimize_threshold, + X_y_binary, +): mock_optimize_threshold.return_value = 0.8 X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='precision', max_iterations=1, optimize_thresholds=False) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="precision", + max_iterations=1, + optimize_thresholds=False, + ) mock_score.return_value = {automl.objective.name: 1.0} automl.search() mock_fit.assert_called() @@ -258,29 +417,73 @@ def test_optimizable_threshold_disabled(mock_fit, mock_score, mock_predict_proba assert not mock_predict_proba.called assert not mock_optimize_threshold.called assert automl.best_pipeline.threshold == 0.5 - assert automl.results['pipeline_results'][0]['cv_data'][0].get('binary_classification_threshold') == 0.5 - assert automl.results['pipeline_results'][0]['cv_data'][1].get('binary_classification_threshold') == 0.5 - assert automl.results['pipeline_results'][0]['cv_data'][2].get('binary_classification_threshold') == 0.5 - - -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') + assert ( + automl.results["pipeline_results"][0]["cv_data"][0].get( + "binary_classification_threshold" + ) + == 0.5 + ) + assert ( + automl.results["pipeline_results"][0]["cv_data"][1].get( + "binary_classification_threshold" + ) + == 0.5 + ) + assert ( + automl.results["pipeline_results"][0]["cv_data"][2].get( + "binary_classification_threshold" + ) + == 0.5 + ) + + +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_non_optimizable_threshold(mock_fit, mock_score, X_y_binary): mock_score.return_value = {"AUC": 1.0} X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='AUC', optimize_thresholds=False, max_iterations=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="AUC", + optimize_thresholds=False, + max_iterations=1, + ) automl.search() mock_fit.assert_called() mock_score.assert_called() assert automl.best_pipeline.threshold is None - assert automl.results['pipeline_results'][0]['cv_data'][0].get('binary_classification_threshold') is None - assert automl.results['pipeline_results'][0]['cv_data'][1].get('binary_classification_threshold') is None - assert automl.results['pipeline_results'][0]['cv_data'][2].get('binary_classification_threshold') is None + assert ( + automl.results["pipeline_results"][0]["cv_data"][0].get( + "binary_classification_threshold" + ) + is None + ) + assert ( + automl.results["pipeline_results"][0]["cv_data"][1].get( + "binary_classification_threshold" + ) + is None + ) + assert ( + automl.results["pipeline_results"][0]["cv_data"][2].get( + "binary_classification_threshold" + ) + is None + ) def test_describe_pipeline_objective_ordered(X_y_binary, caplog): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='AUC', max_iterations=2, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="AUC", + max_iterations=2, + n_jobs=1, + ) automl.search() 
automl.describe_pipeline(0) @@ -296,29 +499,59 @@ def test_describe_pipeline_objective_ordered(X_y_binary, caplog): def test_max_time_units(X_y_binary): X, y = X_y_binary - str_max_time = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='F1', max_time='60 seconds') + str_max_time = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="F1", + max_time="60 seconds", + ) assert str_max_time.max_time == 60 - hour_max_time = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='F1', max_time='1 hour') + hour_max_time = AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", objective="F1", max_time="1 hour" + ) assert hour_max_time.max_time == 3600 - min_max_time = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='F1', max_time='30 mins') + min_max_time = AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", objective="F1", max_time="30 mins" + ) assert min_max_time.max_time == 1800 - min_max_time = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='F1', max_time='30 s') + min_max_time = AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", objective="F1", max_time="30 s" + ) assert min_max_time.max_time == 30 - with pytest.raises(AssertionError, match="Invalid unit. Units must be hours, mins, or seconds. Received 'year'"): - AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='F1', max_time='30 years') - - with pytest.raises(TypeError, match="Parameter max_time must be a float, int, string or None. Received with value \\(30, 'minutes'\\)."): - AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective='F1', max_time=(30, 'minutes')) + with pytest.raises( + AssertionError, + match="Invalid unit. Units must be hours, mins, or seconds. Received 'year'", + ): + AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="F1", + max_time="30 years", + ) + + with pytest.raises( + TypeError, + match="Parameter max_time must be a float, int, string or None. 
Received with value \\(30, 'minutes'\\).", + ): + AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="F1", + max_time=(30, "minutes"), + ) def test_plot_disabled_missing_dependency(X_y_binary, has_minimal_dependencies): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=3) + automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_iterations=3) if has_minimal_dependencies: with pytest.raises(AttributeError): automl.plot.search_iteration_plot @@ -327,15 +560,25 @@ def test_plot_disabled_missing_dependency(X_y_binary, has_minimal_dependencies): def test_plot_iterations_max_iterations(X_y_binary): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective="f1", max_iterations=3, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="f1", + max_iterations=3, + n_jobs=1, + ) automl.search() plot = automl.plot.search_iteration_plot() plot_data = plot.data[0] - x = pd.Series(plot_data['x']) - y = pd.Series(plot_data['y']) + x = pd.Series(plot_data["x"]) + y = pd.Series(plot_data["y"]) assert isinstance(plot, go.Figure) assert x.is_monotonic_increasing @@ -345,15 +588,25 @@ def test_plot_iterations_max_iterations(X_y_binary): def test_plot_iterations_max_time(X_y_binary): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective="f1", max_time=10, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="f1", + max_time=10, + n_jobs=1, + ) automl.search(show_iteration_plot=False) plot = automl.plot.search_iteration_plot() plot_data = plot.data[0] - x = pd.Series(plot_data['x']) - y = pd.Series(plot_data['y']) + x = pd.Series(plot_data["x"]) + y = pd.Series(plot_data["y"]) assert isinstance(plot, go.Figure) assert x.is_monotonic_increasing @@ -362,13 +615,26 @@ def test_plot_iterations_max_time(X_y_binary): assert len(y) > 0 -@patch('IPython.display.display') +@patch("IPython.display.display") def test_plot_iterations_ipython_mock(mock_ipython_display, X_y_binary): - pytest.importorskip('IPython.display', reason='Skipping plotting test because ipywidgets not installed') - pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + pytest.importorskip( + "IPython.display", + reason="Skipping plotting test because ipywidgets not installed", + ) + pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective="f1", max_iterations=3, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="f1", + max_iterations=3, + n_jobs=1, + ) automl.search() plot = automl.plot.search_iteration_plot(interactive_plot=True) assert isinstance(plot, SearchIterationPlot) @@ -376,24 +642,37 @@ def test_plot_iterations_ipython_mock(mock_ipython_display, X_y_binary): 
mock_ipython_display.assert_called_with(plot.best_score_by_iter_fig) -@patch('IPython.display.display') +@patch("IPython.display.display") def test_plot_iterations_ipython_mock_import_failure(mock_ipython_display, X_y_binary): - pytest.importorskip('IPython.display', reason='Skipping plotting test because ipywidgets not installed') - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + pytest.importorskip( + "IPython.display", + reason="Skipping plotting test because ipywidgets not installed", + ) + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective="f1", max_iterations=3, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective="f1", + max_iterations=3, + n_jobs=1, + ) automl.search() - mock_ipython_display.side_effect = ImportError('KABOOOOOOMMMM') + mock_ipython_display.side_effect = ImportError("KABOOOOOOMMMM") plot = automl.plot.search_iteration_plot(interactive_plot=True) mock_ipython_display.assert_called_once() assert isinstance(plot, go.Figure) assert isinstance(plot.data, tuple) plot_data = plot.data[0] - x = pd.Series(plot_data['x']) - y = pd.Series(plot_data['y']) + x = pd.Series(plot_data["x"]) + y = pd.Series(plot_data["y"]) assert x.is_monotonic_increasing assert y.is_monotonic_increasing assert len(x) == 3 @@ -402,27 +681,44 @@ def test_plot_iterations_ipython_mock_import_failure(mock_ipython_display, X_y_b def test_max_time(X_y_binary): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_time=1e-16, n_jobs=1) + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", max_time=1e-16, n_jobs=1 + ) automl.search() # search will always run at least one pipeline - assert len(automl.results['pipeline_results']) == 1 + assert len(automl.results["pipeline_results"]) == 1 @pytest.mark.parametrize("automl_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]) -def test_automl_allowed_pipelines_no_allowed_pipelines(automl_type, X_y_binary, X_y_multi): +def test_automl_allowed_pipelines_no_allowed_pipelines( + automl_type, X_y_binary, X_y_multi +): is_multiclass = automl_type == ProblemTypes.MULTICLASS X, y = X_y_multi if is_multiclass else X_y_binary - problem_type = 'multiclass' if is_multiclass else 'binary' + problem_type = "multiclass" if is_multiclass else "binary" with pytest.raises(ValueError, match="No allowed pipelines to search"): - AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type, allowed_pipelines=None, allowed_model_families=[]) - - -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_automl_allowed_pipelines_specified_allowed_pipelines_binary(mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary): + AutoMLSearch( + X_train=X, + y_train=y, + problem_type=problem_type, + allowed_pipelines=None, + allowed_model_families=[], + ) + + +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_automl_allowed_pipelines_specified_allowed_pipelines_binary( + mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary +): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', - allowed_pipelines=[dummy_binary_pipeline_class({})], allowed_model_families=None) + 
automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + allowed_pipelines=[dummy_binary_pipeline_class({})], + allowed_model_families=None, + ) expected_pipelines = [dummy_binary_pipeline_class({})] mock_score.return_value = {automl.objective.name: 1.0} assert automl.allowed_pipelines == expected_pipelines @@ -435,12 +731,19 @@ def test_automl_allowed_pipelines_specified_allowed_pipelines_binary(mock_fit, m assert automl.allowed_model_families == [ModelFamily.NONE] -@patch('evalml.pipelines.MulticlassClassificationPipeline.score') -@patch('evalml.pipelines.MulticlassClassificationPipeline.fit') -def test_automl_allowed_pipelines_specified_allowed_pipelines_multi(mock_fit, mock_score, dummy_multiclass_pipeline_class, X_y_multi): +@patch("evalml.pipelines.MulticlassClassificationPipeline.score") +@patch("evalml.pipelines.MulticlassClassificationPipeline.fit") +def test_automl_allowed_pipelines_specified_allowed_pipelines_multi( + mock_fit, mock_score, dummy_multiclass_pipeline_class, X_y_multi +): X, y = X_y_multi - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', - allowed_pipelines=[dummy_multiclass_pipeline_class({})], allowed_model_families=None) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="multiclass", + allowed_pipelines=[dummy_multiclass_pipeline_class({})], + allowed_model_families=None, + ) expected_pipelines = [dummy_multiclass_pipeline_class({})] mock_score.return_value = {automl.objective.name: 1.0} assert automl.allowed_pipelines == expected_pipelines @@ -453,13 +756,26 @@ def test_automl_allowed_pipelines_specified_allowed_pipelines_multi(mock_fit, mo assert automl.allowed_model_families == [ModelFamily.NONE] -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_automl_allowed_pipelines_specified_allowed_model_families_binary(mock_fit, mock_score, X_y_binary, assert_allowed_pipelines_equal_helper): +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_automl_allowed_pipelines_specified_allowed_model_families_binary( + mock_fit, mock_score, X_y_binary, assert_allowed_pipelines_equal_helper +): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', allowed_pipelines=None, allowed_model_families=[ModelFamily.RANDOM_FOREST]) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + allowed_pipelines=None, + allowed_model_families=[ModelFamily.RANDOM_FOREST], + ) mock_score.return_value = {automl.objective.name: 1.0} - expected_pipelines = [make_pipeline(X, y, estimator, ProblemTypes.BINARY) for estimator in get_estimators(ProblemTypes.BINARY, model_families=[ModelFamily.RANDOM_FOREST])] + expected_pipelines = [ + make_pipeline(X, y, estimator, ProblemTypes.BINARY) + for estimator in get_estimators( + ProblemTypes.BINARY, model_families=[ModelFamily.RANDOM_FOREST] + ) + ] assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) automl.search() @@ -470,8 +786,19 @@ def test_automl_allowed_pipelines_specified_allowed_model_families_binary(mock_f mock_fit.reset_mock() mock_score.reset_mock() - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', allowed_pipelines=None, allowed_model_families=['random_forest']) - expected_pipelines = [make_pipeline(X, y, estimator, ProblemTypes.BINARY) for estimator in get_estimators(ProblemTypes.BINARY, 
model_families=[ModelFamily.RANDOM_FOREST])] + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + allowed_pipelines=None, + allowed_model_families=["random_forest"], + ) + expected_pipelines = [ + make_pipeline(X, y, estimator, ProblemTypes.BINARY) + for estimator in get_estimators( + ProblemTypes.BINARY, model_families=[ModelFamily.RANDOM_FOREST] + ) + ] assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) automl.search() assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) @@ -480,13 +807,26 @@ def test_automl_allowed_pipelines_specified_allowed_model_families_binary(mock_f mock_score.assert_called() -@patch('evalml.pipelines.MulticlassClassificationPipeline.score') -@patch('evalml.pipelines.MulticlassClassificationPipeline.fit') -def test_automl_allowed_pipelines_specified_allowed_model_families_multi(mock_fit, mock_score, X_y_multi, assert_allowed_pipelines_equal_helper): +@patch("evalml.pipelines.MulticlassClassificationPipeline.score") +@patch("evalml.pipelines.MulticlassClassificationPipeline.fit") +def test_automl_allowed_pipelines_specified_allowed_model_families_multi( + mock_fit, mock_score, X_y_multi, assert_allowed_pipelines_equal_helper +): X, y = X_y_multi - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=None, allowed_model_families=[ModelFamily.RANDOM_FOREST]) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="multiclass", + allowed_pipelines=None, + allowed_model_families=[ModelFamily.RANDOM_FOREST], + ) mock_score.return_value = {automl.objective.name: 1.0} - expected_pipelines = [make_pipeline(X, y, estimator, ProblemTypes.MULTICLASS) for estimator in get_estimators(ProblemTypes.MULTICLASS, model_families=[ModelFamily.RANDOM_FOREST])] + expected_pipelines = [ + make_pipeline(X, y, estimator, ProblemTypes.MULTICLASS) + for estimator in get_estimators( + ProblemTypes.MULTICLASS, model_families=[ModelFamily.RANDOM_FOREST] + ) + ] assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) automl.search() @@ -497,8 +837,19 @@ def test_automl_allowed_pipelines_specified_allowed_model_families_multi(mock_fi mock_fit.reset_mock() mock_score.reset_mock() - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=None, allowed_model_families=['random_forest']) - expected_pipelines = [make_pipeline(X, y, estimator, ProblemTypes.MULTICLASS) for estimator in get_estimators(ProblemTypes.MULTICLASS, model_families=[ModelFamily.RANDOM_FOREST])] + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="multiclass", + allowed_pipelines=None, + allowed_model_families=["random_forest"], + ) + expected_pipelines = [ + make_pipeline(X, y, estimator, ProblemTypes.MULTICLASS) + for estimator in get_estimators( + ProblemTypes.MULTICLASS, model_families=[ModelFamily.RANDOM_FOREST] + ) + ] assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) automl.search() assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) @@ -507,43 +858,81 @@ def test_automl_allowed_pipelines_specified_allowed_model_families_multi(mock_fi mock_score.assert_called() -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_automl_allowed_pipelines_init_allowed_both_not_specified_binary(mock_fit, mock_score, X_y_binary, assert_allowed_pipelines_equal_helper): 
+@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_automl_allowed_pipelines_init_allowed_both_not_specified_binary( + mock_fit, mock_score, X_y_binary, assert_allowed_pipelines_equal_helper +): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', allowed_pipelines=None, allowed_model_families=None) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + allowed_pipelines=None, + allowed_model_families=None, + ) mock_score.return_value = {automl.objective.name: 1.0} - expected_pipelines = [make_pipeline(X, y, estimator, ProblemTypes.BINARY) for estimator in get_estimators(ProblemTypes.BINARY, model_families=None)] + expected_pipelines = [ + make_pipeline(X, y, estimator, ProblemTypes.BINARY) + for estimator in get_estimators(ProblemTypes.BINARY, model_families=None) + ] assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) automl.search() assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) - assert set(automl.allowed_model_families) == set([p.model_family for p in expected_pipelines]) + assert set(automl.allowed_model_families) == set( + [p.model_family for p in expected_pipelines] + ) mock_fit.assert_called() mock_score.assert_called() -@patch('evalml.pipelines.MulticlassClassificationPipeline.score') -@patch('evalml.pipelines.MulticlassClassificationPipeline.fit') -def test_automl_allowed_pipelines_init_allowed_both_not_specified_multi(mock_fit, mock_score, X_y_multi, assert_allowed_pipelines_equal_helper): +@patch("evalml.pipelines.MulticlassClassificationPipeline.score") +@patch("evalml.pipelines.MulticlassClassificationPipeline.fit") +def test_automl_allowed_pipelines_init_allowed_both_not_specified_multi( + mock_fit, mock_score, X_y_multi, assert_allowed_pipelines_equal_helper +): X, y = X_y_multi - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=None, allowed_model_families=None) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="multiclass", + allowed_pipelines=None, + allowed_model_families=None, + ) mock_score.return_value = {automl.objective.name: 1.0} - expected_pipelines = [make_pipeline(X, y, estimator, ProblemTypes.MULTICLASS) for estimator in get_estimators(ProblemTypes.MULTICLASS, model_families=None)] + expected_pipelines = [ + make_pipeline(X, y, estimator, ProblemTypes.MULTICLASS) + for estimator in get_estimators(ProblemTypes.MULTICLASS, model_families=None) + ] assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) automl.search() assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) - assert set(automl.allowed_model_families) == set([p.model_family for p in expected_pipelines]) + assert set(automl.allowed_model_families) == set( + [p.model_family for p in expected_pipelines] + ) mock_fit.assert_called() mock_score.assert_called() -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_automl_allowed_pipelines_init_allowed_both_specified_binary(mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary, assert_allowed_pipelines_equal_helper): +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_automl_allowed_pipelines_init_allowed_both_specified_binary( + mock_fit, + mock_score, + 
dummy_binary_pipeline_class, + X_y_binary, + assert_allowed_pipelines_equal_helper, +): X, y = X_y_binary - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', allowed_pipelines=[dummy_binary_pipeline_class({})], allowed_model_families=[ModelFamily.RANDOM_FOREST]) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + allowed_pipelines=[dummy_binary_pipeline_class({})], + allowed_model_families=[ModelFamily.RANDOM_FOREST], + ) mock_score.return_value = {automl.objective.name: 1.0} expected_pipelines = [dummy_binary_pipeline_class({})] assert automl.allowed_pipelines == expected_pipelines @@ -552,16 +941,30 @@ def test_automl_allowed_pipelines_init_allowed_both_specified_binary(mock_fit, m automl.search() assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) - assert set(automl.allowed_model_families) == set([p.model_family for p in expected_pipelines]) + assert set(automl.allowed_model_families) == set( + [p.model_family for p in expected_pipelines] + ) mock_fit.assert_called() mock_score.assert_called() -@patch('evalml.pipelines.MulticlassClassificationPipeline.score') -@patch('evalml.pipelines.MulticlassClassificationPipeline.fit') -def test_automl_allowed_pipelines_init_allowed_both_specified_multi(mock_fit, mock_score, dummy_multiclass_pipeline_class, X_y_multi, assert_allowed_pipelines_equal_helper): +@patch("evalml.pipelines.MulticlassClassificationPipeline.score") +@patch("evalml.pipelines.MulticlassClassificationPipeline.fit") +def test_automl_allowed_pipelines_init_allowed_both_specified_multi( + mock_fit, + mock_score, + dummy_multiclass_pipeline_class, + X_y_multi, + assert_allowed_pipelines_equal_helper, +): X, y = X_y_multi - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass', allowed_pipelines=[dummy_multiclass_pipeline_class({})], allowed_model_families=[ModelFamily.RANDOM_FOREST]) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="multiclass", + allowed_pipelines=[dummy_multiclass_pipeline_class({})], + allowed_model_families=[ModelFamily.RANDOM_FOREST], + ) mock_score.return_value = {automl.objective.name: 1.0} expected_pipelines = [dummy_multiclass_pipeline_class({})] assert automl.allowed_pipelines == expected_pipelines @@ -570,135 +973,237 @@ def test_automl_allowed_pipelines_init_allowed_both_specified_multi(mock_fit, mo automl.search() assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) - assert set(automl.allowed_model_families) == set([p.model_family for p in expected_pipelines]) + assert set(automl.allowed_model_families) == set( + [p.model_family for p in expected_pipelines] + ) mock_fit.assert_called() mock_score.assert_called() -@pytest.mark.parametrize('is_linear', [True, False]) -@pytest.mark.parametrize('problem_type', [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]) -@patch('evalml.pipelines.MulticlassClassificationPipeline.score') -@patch('evalml.pipelines.MulticlassClassificationPipeline.fit') -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_automl_allowed_pipelines_search(mock_binary_fit, mock_binary_score, - mock_multi_fit, mock_multi_score, - is_linear, problem_type, - dummy_binary_pipeline_class, nonlinear_binary_pipeline_class, - dummy_multiclass_pipeline_class, nonlinear_multiclass_pipeline_class, - X_y_binary, X_y_multi): +@pytest.mark.parametrize("is_linear", [True, False]) +@pytest.mark.parametrize("problem_type", 
[ProblemTypes.BINARY, ProblemTypes.MULTICLASS]) +@patch("evalml.pipelines.MulticlassClassificationPipeline.score") +@patch("evalml.pipelines.MulticlassClassificationPipeline.fit") +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_automl_allowed_pipelines_search( + mock_binary_fit, + mock_binary_score, + mock_multi_fit, + mock_multi_score, + is_linear, + problem_type, + dummy_binary_pipeline_class, + nonlinear_binary_pipeline_class, + dummy_multiclass_pipeline_class, + nonlinear_multiclass_pipeline_class, + X_y_binary, + X_y_multi, +): if problem_type == ProblemTypes.BINARY: X, y = X_y_binary - mock_binary_score.return_value = {'Log Loss Binary': 1.0} + mock_binary_score.return_value = {"Log Loss Binary": 1.0} expected_mock_class = BinaryClassificationPipeline - pipeline_class = dummy_binary_pipeline_class if is_linear else nonlinear_binary_pipeline_class + pipeline_class = ( + dummy_binary_pipeline_class + if is_linear + else nonlinear_binary_pipeline_class + ) else: X, y = X_y_multi - mock_multi_score.return_value = {'Log Loss Multiclass': 1.0} + mock_multi_score.return_value = {"Log Loss Multiclass": 1.0} expected_mock_class = MulticlassClassificationPipeline - pipeline_class = dummy_multiclass_pipeline_class if is_linear else nonlinear_multiclass_pipeline_class + pipeline_class = ( + dummy_multiclass_pipeline_class + if is_linear + else nonlinear_multiclass_pipeline_class + ) allowed_pipelines = [pipeline_class({})] start_iteration_callback = MagicMock() - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type, - max_iterations=5, start_iteration_callback=start_iteration_callback, - allowed_pipelines=allowed_pipelines) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=problem_type, + max_iterations=5, + start_iteration_callback=start_iteration_callback, + allowed_pipelines=allowed_pipelines, + ) automl.search() - assert isinstance(start_iteration_callback.call_args_list[0][0][0], expected_mock_class) + assert isinstance( + start_iteration_callback.call_args_list[0][0][0], expected_mock_class + ) for i in range(1, 5): - assert isinstance(start_iteration_callback.call_args_list[i][0][0], pipeline_class) + assert isinstance( + start_iteration_callback.call_args_list[i][0][0], pipeline_class + ) -@pytest.mark.parametrize('problem_type', [ProblemTypes.TIME_SERIES_MULTICLASS, ProblemTypes.TIME_SERIES_BINARY]) -@patch('evalml.pipelines.TimeSeriesMulticlassClassificationPipeline.score') -@patch('evalml.pipelines.TimeSeriesBinaryClassificationPipeline.score') -@patch('evalml.pipelines.TimeSeriesMulticlassClassificationPipeline.fit') -@patch('evalml.pipelines.TimeSeriesBinaryClassificationPipeline.fit') -def test_automl_supports_time_series_classification(mock_binary_fit, mock_multi_fit, mock_binary_score, mock_multiclass_score, - problem_type, X_y_binary, X_y_multi): +@pytest.mark.parametrize( + "problem_type", + [ProblemTypes.TIME_SERIES_MULTICLASS, ProblemTypes.TIME_SERIES_BINARY], +) +@patch("evalml.pipelines.TimeSeriesMulticlassClassificationPipeline.score") +@patch("evalml.pipelines.TimeSeriesBinaryClassificationPipeline.score") +@patch("evalml.pipelines.TimeSeriesMulticlassClassificationPipeline.fit") +@patch("evalml.pipelines.TimeSeriesBinaryClassificationPipeline.fit") +def test_automl_supports_time_series_classification( + mock_binary_fit, + mock_multi_fit, + mock_binary_score, + mock_multiclass_score, + problem_type, + X_y_binary, + X_y_multi, +): if problem_type == 
ProblemTypes.TIME_SERIES_BINARY: X, y = X_y_binary - baseline = TimeSeriesBinaryClassificationPipeline(component_graph=["Time Series Baseline Estimator"], - parameters={'Time Series Baseline Estimator': {"date_index": None, 'gap': 0, 'max_delay': 0}, - 'pipeline': {"date_index": None, 'gap': 0, 'max_delay': 0}}) + baseline = TimeSeriesBinaryClassificationPipeline( + component_graph=["Time Series Baseline Estimator"], + parameters={ + "Time Series Baseline Estimator": { + "date_index": None, + "gap": 0, + "max_delay": 0, + }, + "pipeline": {"date_index": None, "gap": 0, "max_delay": 0}, + }, + ) mock_binary_score.return_value = {"Log Loss Binary": 0.2} - problem_type = 'time series binary' + problem_type = "time series binary" else: X, y = X_y_multi - baseline = TimeSeriesMulticlassClassificationPipeline(component_graph=["Time Series Baseline Estimator"], - parameters={'Time Series Baseline Estimator': {"date_index": None, 'gap': 0, 'max_delay': 0}, - 'pipeline': {"date_index": None, 'gap': 0, 'max_delay': 0}}) + baseline = TimeSeriesMulticlassClassificationPipeline( + component_graph=["Time Series Baseline Estimator"], + parameters={ + "Time Series Baseline Estimator": { + "date_index": None, + "gap": 0, + "max_delay": 0, + }, + "pipeline": {"date_index": None, "gap": 0, "max_delay": 0}, + }, + ) mock_multiclass_score.return_value = {"Log Loss Multiclass": 0.25} - problem_type = 'time series multiclass' - - configuration = {"date_index": None, "gap": 0, "max_delay": 0, 'delay_target': False, 'delay_features': True} + problem_type = "time series multiclass" + + configuration = { + "date_index": None, + "gap": 0, + "max_delay": 0, + "delay_target": False, + "delay_features": True, + } - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type, - problem_configuration=configuration, - max_batches=2) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=problem_type, + problem_configuration=configuration, + max_batches=2, + ) automl.search() assert isinstance(automl.data_splitter, TimeSeriesSplit) - for result in automl.results['pipeline_results'].values(): + for result in automl.results["pipeline_results"].values(): if result["id"] == 0: - assert result['pipeline_class'] == baseline.__class__ + assert result["pipeline_class"] == baseline.__class__ continue - assert result['parameters']['Delayed Feature Transformer'] == configuration - assert result['parameters']['pipeline'] == configuration + assert result["parameters"]["Delayed Feature Transformer"] == configuration + assert result["parameters"]["pipeline"] == configuration -@pytest.mark.parametrize("objective", ['F1', 'Log Loss Binary']) +@pytest.mark.parametrize("objective", ["F1", "Log Loss Binary"]) @pytest.mark.parametrize("optimize", [True, False]) -@patch('evalml.automl.engine.engine_base.split_data') -@patch('evalml.objectives.BinaryClassificationObjective.optimize_threshold') -@patch('evalml.pipelines.TimeSeriesBinaryClassificationPipeline._encode_targets', side_effect=lambda y: y) -@patch('evalml.pipelines.TimeSeriesBinaryClassificationPipeline.predict_proba') -@patch('evalml.pipelines.TimeSeriesBinaryClassificationPipeline.score') -@patch('evalml.pipelines.TimeSeriesBinaryClassificationPipeline.fit') -def test_automl_time_series_classification_threshold(mock_binary_fit, mock_binary_score, mock_predict_proba, mock_encode_targets, mock_optimize_threshold, mock_split_data, - optimize, objective, X_y_binary): +@patch("evalml.automl.engine.engine_base.split_data") 
+@patch("evalml.objectives.BinaryClassificationObjective.optimize_threshold") +@patch( + "evalml.pipelines.TimeSeriesBinaryClassificationPipeline._encode_targets", + side_effect=lambda y: y, +) +@patch("evalml.pipelines.TimeSeriesBinaryClassificationPipeline.predict_proba") +@patch("evalml.pipelines.TimeSeriesBinaryClassificationPipeline.score") +@patch("evalml.pipelines.TimeSeriesBinaryClassificationPipeline.fit") +def test_automl_time_series_classification_threshold( + mock_binary_fit, + mock_binary_score, + mock_predict_proba, + mock_encode_targets, + mock_optimize_threshold, + mock_split_data, + optimize, + objective, + X_y_binary, +): X, y = X_y_binary mock_binary_score.return_value = {objective: 0.4} - problem_type = 'time series binary' - - configuration = {"date_index": None, "gap": 0, "max_delay": 0, 'delay_target': False, 'delay_features': True} + problem_type = "time series binary" + + configuration = { + "date_index": None, + "gap": 0, + "max_delay": 0, + "delay_target": False, + "delay_features": True, + } mock_optimize_threshold.return_value = 0.62 - mock_split_data.return_value = split_data(X, y, problem_type, test_size=0.2, random_seed=0) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type, - problem_configuration=configuration, objective=objective, optimize_thresholds=optimize, - max_batches=2) + mock_split_data.return_value = split_data( + X, y, problem_type, test_size=0.2, random_seed=0 + ) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=problem_type, + problem_configuration=configuration, + objective=objective, + optimize_thresholds=optimize, + max_batches=2, + ) automl.search() assert isinstance(automl.data_splitter, TimeSeriesSplit) - if objective == 'Log Loss Binary': + if objective == "Log Loss Binary": mock_optimize_threshold.assert_not_called() assert automl.best_pipeline.threshold is None mock_split_data.assert_not_called() - elif optimize and objective == 'F1': + elif optimize and objective == "F1": mock_optimize_threshold.assert_called() assert automl.best_pipeline.threshold == 0.62 mock_split_data.assert_called() assert str(mock_split_data.call_args[0][2]) == problem_type - elif not optimize and objective == 'F1': + elif not optimize and objective == "F1": mock_optimize_threshold.assert_not_called() assert automl.best_pipeline.threshold == 0.5 mock_split_data.assert_not_called() -@pytest.mark.parametrize("objective", ['F1', 'Log Loss Binary', 'AUC']) -@patch('evalml.objectives.BinaryClassificationObjective.optimize_threshold') -@patch('evalml.pipelines.BinaryClassificationPipeline._encode_targets', side_effect=lambda y: y) -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -@patch('evalml.pipelines.BinaryClassificationPipeline.predict_proba') -def test_tuning_threshold_objective(mock_predict, mock_fit, mock_score, mock_encode_targets, mock_optimize_threshold, objective, X_y_binary): +@pytest.mark.parametrize("objective", ["F1", "Log Loss Binary", "AUC"]) +@patch("evalml.objectives.BinaryClassificationObjective.optimize_threshold") +@patch( + "evalml.pipelines.BinaryClassificationPipeline._encode_targets", + side_effect=lambda y: y, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba") +def test_tuning_threshold_objective( + mock_predict, + mock_fit, + mock_score, + mock_encode_targets, + 
mock_optimize_threshold, + objective, + X_y_binary, +): mock_optimize_threshold.return_value = 0.6 X, y = X_y_binary mock_score.return_value = {objective: 0.5} - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective=objective) + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="binary", objective=objective + ) automl.search() if objective != "F1": @@ -707,93 +1212,187 @@ def test_tuning_threshold_objective(mock_predict, mock_fit, mock_score, mock_enc assert automl.best_pipeline.threshold == 0.6 -@pytest.mark.parametrize("problem_type", ['binary', 'multiclass']) -@pytest.mark.parametrize("categorical_features", ['none', 'some', 'all']) -@pytest.mark.parametrize("size", ['small', 'large']) +@pytest.mark.parametrize("problem_type", ["binary", "multiclass"]) +@pytest.mark.parametrize("categorical_features", ["none", "some", "all"]) +@pytest.mark.parametrize("size", ["small", "large"]) @pytest.mark.parametrize("sampling_ratio", [0.8, 0.5, 0.25, 0.2, 0.1, 0.05]) -def test_automl_search_sampler_ratio(sampling_ratio, size, categorical_features, problem_type, mock_imbalanced_data_X_y, has_minimal_dependencies): +def test_automl_search_sampler_ratio( + sampling_ratio, + size, + categorical_features, + problem_type, + mock_imbalanced_data_X_y, + has_minimal_dependencies, +): X, y = mock_imbalanced_data_X_y(problem_type, categorical_features, size) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type, sampler_method='auto', sampler_balanced_ratio=sampling_ratio) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=problem_type, + sampler_method="auto", + sampler_balanced_ratio=sampling_ratio, + ) pipelines = automl.allowed_pipelines if sampling_ratio <= 0.2: # we consider this balanced, so we expect no samplers - assert not any(any("sampler" in comp.name for comp in pipeline.component_graph) for pipeline in pipelines) + assert not any( + any("sampler" in comp.name for comp in pipeline.component_graph) + for pipeline in pipelines + ) else: - if size == 'large' or has_minimal_dependencies: - assert all(any("Undersampler" in comp.name for comp in pipeline.component_graph) for pipeline in pipelines) - elif categorical_features == 'none': - assert all(any("SMOTE Oversampler" in comp.name for comp in pipeline.component_graph) for pipeline in pipelines) - elif categorical_features == 'some': - assert all(any("SMOTENC Oversampler" in comp.name for comp in pipeline.component_graph) for pipeline in pipelines) - elif categorical_features == 'all': - assert all(any("SMOTEN Oversampler" in comp.name for comp in pipeline.component_graph) for pipeline in pipelines) + if size == "large" or has_minimal_dependencies: + assert all( + any("Undersampler" in comp.name for comp in pipeline.component_graph) + for pipeline in pipelines + ) + elif categorical_features == "none": + assert all( + any( + "SMOTE Oversampler" in comp.name + for comp in pipeline.component_graph + ) + for pipeline in pipelines + ) + elif categorical_features == "some": + assert all( + any( + "SMOTENC Oversampler" in comp.name + for comp in pipeline.component_graph + ) + for pipeline in pipelines + ) + elif categorical_features == "all": + assert all( + any( + "SMOTEN Oversampler" in comp.name + for comp in pipeline.component_graph + ) + for pipeline in pipelines + ) for comp in pipelines[0]._component_graph: - if 'sampler' in comp.name: - assert comp.parameters['sampling_ratio'] == sampling_ratio - - -@pytest.mark.parametrize("problem_type", ['binary', 'multiclass']) 
-@pytest.mark.parametrize("sampler_method,categorical_features", [(None, 'none'), (None, 'some'), (None, 'all'), - ('Undersampler', 'none'), ('Undersampler', 'some'), ('Undersampler', 'all'), - ('Oversampler', 'none'), ('Oversampler', 'some'), ('Oversampler', 'all')]) -def test_automl_search_sampler_method(sampler_method, categorical_features, problem_type, mock_imbalanced_data_X_y, has_minimal_dependencies, caplog): + if "sampler" in comp.name: + assert comp.parameters["sampling_ratio"] == sampling_ratio + + +@pytest.mark.parametrize("problem_type", ["binary", "multiclass"]) +@pytest.mark.parametrize( + "sampler_method,categorical_features", + [ + (None, "none"), + (None, "some"), + (None, "all"), + ("Undersampler", "none"), + ("Undersampler", "some"), + ("Undersampler", "all"), + ("Oversampler", "none"), + ("Oversampler", "some"), + ("Oversampler", "all"), + ], +) +def test_automl_search_sampler_method( + sampler_method, + categorical_features, + problem_type, + mock_imbalanced_data_X_y, + has_minimal_dependencies, + caplog, +): # 0.2 minority:majority class ratios - X, y = mock_imbalanced_data_X_y(problem_type, categorical_features, 'small') - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type, sampler_method=sampler_method) + X, y = mock_imbalanced_data_X_y(problem_type, categorical_features, "small") + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type=problem_type, sampler_method=sampler_method + ) # since our default sampler_balanced_ratio for AutoMLSearch is 0.25, we should be adding the samplers when we can pipelines = automl.allowed_pipelines if sampler_method is None: - assert not any(any("sampler" in comp.name for comp in pipeline.component_graph) for pipeline in pipelines) + assert not any( + any("sampler" in comp.name for comp in pipeline.component_graph) + for pipeline in pipelines + ) else: if has_minimal_dependencies: - sampler_method = 'Undersampler' - assert 'Could not import imblearn.over_sampling' in caplog.text - assert all(any(sampler_method in comp.name for comp in pipeline.component_graph) for pipeline in pipelines) + sampler_method = "Undersampler" + assert "Could not import imblearn.over_sampling" in caplog.text + assert all( + any(sampler_method in comp.name for comp in pipeline.component_graph) + for pipeline in pipelines + ) @pytest.mark.parametrize("sampling_ratio", [0.1, 0.2, 0.5, 1]) @pytest.mark.parametrize("sampler", ["Undersampler", "SMOTE Oversampler"]) -def test_automl_search_ratio_overrides_sampler_ratio(sampler, sampling_ratio, mock_imbalanced_data_X_y, has_minimal_dependencies): +def test_automl_search_ratio_overrides_sampler_ratio( + sampler, sampling_ratio, mock_imbalanced_data_X_y, has_minimal_dependencies +): if has_minimal_dependencies and sampler == "SMOTE Oversampler": pytest.skip("Skipping test with minimal dependencies") - X, y = mock_imbalanced_data_X_y("binary", 'none', 'small') + X, y = mock_imbalanced_data_X_y("binary", "none", "small") pipeline_parameters = {sampler: {"sampling_ratio": sampling_ratio}} - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', sampler_method=sampler, pipeline_parameters=pipeline_parameters, sampler_balanced_ratio=0.5) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + sampler_method=sampler, + pipeline_parameters=pipeline_parameters, + sampler_balanced_ratio=0.5, + ) # make sure that our sampling_balanced_ratio of 0.5 overrides the pipeline params passed in pipelines = automl.allowed_pipelines for pipeline in pipelines: seen_sampler = 
False for comp in pipeline._component_graph: if comp.name == sampler: - assert comp.parameters['sampling_ratio'] == 0.5 + assert comp.parameters["sampling_ratio"] == 0.5 seen_sampler = True assert seen_sampler -@pytest.mark.parametrize("problem_type,sampling_ratio_dict,length", [("binary", {0: 0.5, 1: 1}, 600), - ("binary", {0: 0.2, 1: 1}, 800), - ("multiclass", {0: 0.5, 1: 1, 2: 1}, 400), - ("multiclass", {0: 0.75, 1: 1, 2: 1}, 333)]) -@patch('evalml.pipelines.components.estimators.Estimator.fit') -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.5}) -@patch('evalml.pipelines.MulticlassClassificationPipeline.score', return_value={"Log Loss Multiclass": 0.5}) -def test_automl_search_dictionary_undersampler(mock_multi_score, mock_binary_score, mock_est_fit, - problem_type, sampling_ratio_dict, length): - X = pd.DataFrame({"a": [i for i in range(1200)], - "b": [i % 3 for i in range(1200)]}) - if problem_type == 'binary': +@pytest.mark.parametrize( + "problem_type,sampling_ratio_dict,length", + [ + ("binary", {0: 0.5, 1: 1}, 600), + ("binary", {0: 0.2, 1: 1}, 800), + ("multiclass", {0: 0.5, 1: 1, 2: 1}, 400), + ("multiclass", {0: 0.75, 1: 1, 2: 1}, 333), + ], +) +@patch("evalml.pipelines.components.estimators.Estimator.fit") +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.5}, +) +@patch( + "evalml.pipelines.MulticlassClassificationPipeline.score", + return_value={"Log Loss Multiclass": 0.5}, +) +def test_automl_search_dictionary_undersampler( + mock_multi_score, + mock_binary_score, + mock_est_fit, + problem_type, + sampling_ratio_dict, + length, +): + X = pd.DataFrame({"a": [i for i in range(1200)], "b": [i % 3 for i in range(1200)]}) + if problem_type == "binary": y = pd.Series([0] * 900 + [1] * 300) else: y = pd.Series([0] * 900 + [1] * 150 + [2] * 150) pipeline_parameters = {"Undersampler": {"sampling_ratio_dict": sampling_ratio_dict}} - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type, sampler_method='Undersampler', pipeline_parameters=pipeline_parameters) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=problem_type, + sampler_method="Undersampler", + pipeline_parameters=pipeline_parameters, + ) # check that the sampling dict got set properly pipelines = automl.allowed_pipelines for pipeline in pipelines: seen_under = False for comp in pipeline._component_graph: - if comp.name == 'Undersampler': - assert comp.parameters['sampling_ratio_dict'] == sampling_ratio_dict + if comp.name == "Undersampler": + assert comp.parameters["sampling_ratio_dict"] == sampling_ratio_dict seen_under = True assert seen_under automl.search() @@ -801,33 +1400,59 @@ def test_automl_search_dictionary_undersampler(mock_multi_score, mock_binary_sco assert len(mock_est_fit.call_args[0][0]) == length -@pytest.mark.parametrize("problem_type,sampling_ratio_dict,length", [("binary", {0: 1, 1: 0.5}, 900), - ("binary", {0: 1, 1: 0.8}, 1080), - ("multiclass", {0: 1, 1: 0.5, 2: 0.5}, 1200), - ("multiclass", {0: 1, 1: 0.8, 2: 0.8}, 1560)]) -@patch('evalml.pipelines.components.estimators.Estimator.fit') -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.5}) -@patch('evalml.pipelines.MulticlassClassificationPipeline.score', return_value={"Log Loss Multiclass": 0.5}) -def test_automl_search_dictionary_oversampler(mock_multi_score, mock_binary_score, mock_est_fit, - problem_type, sampling_ratio_dict, length): - pytest.importorskip("imblearn", 
reason="Skipping tests since imblearn isn't installed") +@pytest.mark.parametrize( + "problem_type,sampling_ratio_dict,length", + [ + ("binary", {0: 1, 1: 0.5}, 900), + ("binary", {0: 1, 1: 0.8}, 1080), + ("multiclass", {0: 1, 1: 0.5, 2: 0.5}, 1200), + ("multiclass", {0: 1, 1: 0.8, 2: 0.8}, 1560), + ], +) +@patch("evalml.pipelines.components.estimators.Estimator.fit") +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.5}, +) +@patch( + "evalml.pipelines.MulticlassClassificationPipeline.score", + return_value={"Log Loss Multiclass": 0.5}, +) +def test_automl_search_dictionary_oversampler( + mock_multi_score, + mock_binary_score, + mock_est_fit, + problem_type, + sampling_ratio_dict, + length, +): + pytest.importorskip( + "imblearn", reason="Skipping tests since imblearn isn't installed" + ) # split this from the undersampler since the dictionaries are formatted differently - X = pd.DataFrame({"a": [i for i in range(1200)], - "b": [i % 3 for i in range(1200)]}) - if problem_type == 'binary': + X = pd.DataFrame({"a": [i for i in range(1200)], "b": [i % 3 for i in range(1200)]}) + if problem_type == "binary": y = pd.Series([0] * 900 + [1] * 300) else: y = pd.Series([0] * 900 + [1] * 150 + [2] * 150) # we only test with SMOTE Oversampler since the oversamplers perform similarly - pipeline_parameters = {"SMOTE Oversampler": {"sampling_ratio_dict": sampling_ratio_dict}} - automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type, sampler_method='SMOTE Oversampler', pipeline_parameters=pipeline_parameters) + pipeline_parameters = { + "SMOTE Oversampler": {"sampling_ratio_dict": sampling_ratio_dict} + } + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=problem_type, + sampler_method="SMOTE Oversampler", + pipeline_parameters=pipeline_parameters, + ) # check that the sampling dict got set properly pipelines = automl.allowed_pipelines for pipeline in pipelines: seen_under = False for comp in pipeline._component_graph: - if comp.name == 'SMOTE Oversampler': - assert comp.parameters['sampling_ratio_dict'] == sampling_ratio_dict + if comp.name == "SMOTE Oversampler": + assert comp.parameters["sampling_ratio_dict"] == sampling_ratio_dict seen_under = True assert seen_under automl.search() @@ -835,41 +1460,66 @@ def test_automl_search_dictionary_oversampler(mock_multi_score, mock_binary_scor assert len(mock_est_fit.call_args[0][0]) == length -@pytest.mark.parametrize("sampling_ratio_dict,errors", [({0: 1, 1: 0.5}, False), - ({"majority": 1, "minority": 0.5}, True)]) -@pytest.mark.parametrize("sampler", ['Undersampler', 'SMOTE Oversampler']) -@patch('evalml.pipelines.components.estimators.Estimator.fit') -@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.5}) -def test_automl_search_sampler_dictionary_keys(mock_binary_score, mock_est_fit, sampler, sampling_ratio_dict, errors, has_minimal_dependencies): +@pytest.mark.parametrize( + "sampling_ratio_dict,errors", + [({0: 1, 1: 0.5}, False), ({"majority": 1, "minority": 0.5}, True)], +) +@pytest.mark.parametrize("sampler", ["Undersampler", "SMOTE Oversampler"]) +@patch("evalml.pipelines.components.estimators.Estimator.fit") +@patch( + "evalml.pipelines.BinaryClassificationPipeline.score", + return_value={"Log Loss Binary": 0.5}, +) +def test_automl_search_sampler_dictionary_keys( + mock_binary_score, + mock_est_fit, + sampler, + sampling_ratio_dict, + errors, + has_minimal_dependencies, +): if sampler == "SMOTE Oversampler" and 
has_minimal_dependencies: pytest.skip("Skipping tests since imblearn isn't installed") # split this from the undersampler since the dictionaries are formatted differently - X = pd.DataFrame({"a": [i for i in range(1200)], - "b": [i % 3 for i in range(1200)]}) + X = pd.DataFrame({"a": [i for i in range(1200)], "b": [i % 3 for i in range(1200)]}) y = pd.Series(["majority"] * 900 + ["minority"] * 300) pipeline_parameters = {sampler: {"sampling_ratio_dict": sampling_ratio_dict}} - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', error_callback=raise_error_callback, sampler_method=sampler, pipeline_parameters=pipeline_parameters) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + error_callback=raise_error_callback, + sampler_method=sampler, + pipeline_parameters=pipeline_parameters, + ) if errors: - with pytest.raises(ValueError, match='Dictionary keys are different from target'): + with pytest.raises( + ValueError, match="Dictionary keys are different from target" + ): automl.search() else: automl.search() -@pytest.mark.parametrize("sampler", ['Undersampler', 'SMOTE Oversampler']) +@pytest.mark.parametrize("sampler", ["Undersampler", "SMOTE Oversampler"]) def test_automl_search_sampler_k_neighbors_param(sampler, has_minimal_dependencies): if sampler == "SMOTE Oversampler" and has_minimal_dependencies: pytest.skip("Skipping tests since imblearn isn't installed") # split this from the undersampler since the dictionaries are formatted differently - X = pd.DataFrame({"a": [i for i in range(1200)], - "b": [i % 3 for i in range(1200)]}) + X = pd.DataFrame({"a": [i for i in range(1200)], "b": [i % 3 for i in range(1200)]}) y = pd.Series(["majority"] * 900 + ["minority"] * 300) pipeline_parameters = {sampler: {"k_neighbors": 2}} - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', sampler_method=sampler, pipeline_parameters=pipeline_parameters) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + sampler_method=sampler, + pipeline_parameters=pipeline_parameters, + ) for pipeline in automl.allowed_pipelines: seen_under = False for comp in pipeline._component_graph: if comp.name == sampler: - assert comp.parameters['k_neighbors'] == 2 + assert comp.parameters["k_neighbors"] == 2 seen_under = True assert seen_under diff --git a/evalml/tests/automl_tests/test_automl_search_regression.py b/evalml/tests/automl_tests/test_automl_search_regression.py index 53d1436893..f1df1acd3e 100644 --- a/evalml/tests/automl_tests/test_automl_search_regression.py +++ b/evalml/tests/automl_tests/test_automl_search_regression.py @@ -10,7 +10,7 @@ from evalml.pipelines import ( PipelineBase, RegressionPipeline, - TimeSeriesRegressionPipeline + TimeSeriesRegressionPipeline, ) from evalml.pipelines.components.utils import get_estimators from evalml.pipelines.utils import make_pipeline @@ -21,7 +21,14 @@ def test_init(X_y_regression): X, y = X_y_regression - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', objective="R2", max_iterations=3, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + objective="R2", + max_iterations=3, + n_jobs=1, + ) automl.search() assert automl.n_jobs == 1 @@ -30,7 +37,14 @@ def test_init(X_y_regression): automl.best_pipeline.predict(X) # test with dataframes - automl = AutoMLSearch(pd.DataFrame(X), pd.Series(y), problem_type='regression', objective="R2", max_iterations=3, n_jobs=1) + automl = AutoMLSearch( + pd.DataFrame(X), + pd.Series(y), + 
problem_type="regression", + objective="R2", + max_iterations=3, + n_jobs=1, + ) automl.search() assert isinstance(automl.rankings, pd.DataFrame) @@ -42,12 +56,26 @@ def test_init(X_y_regression): def test_random_seed(X_y_regression): X, y = X_y_regression - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', objective="R2", max_iterations=5, random_seed=0, - n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + objective="R2", + max_iterations=5, + random_seed=0, + n_jobs=1, + ) automl.search() - automl_1 = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', objective="R2", max_iterations=5, random_seed=0, - n_jobs=1) + automl_1 = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + objective="R2", + max_iterations=5, + random_seed=0, + n_jobs=1, + ) automl_1.search() # need to use assert_frame_equal as R2 could be different at the 10+ decimal @@ -56,8 +84,15 @@ def test_random_seed(X_y_regression): def test_categorical_regression(X_y_categorical_regression): X, y = X_y_categorical_regression - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', objective="R2", max_iterations=5, random_seed=0, - n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + objective="R2", + max_iterations=5, + random_seed=0, + n_jobs=1, + ) automl.search() assert not automl.rankings["mean_cv_score"].isnull().all() @@ -77,9 +112,16 @@ def add_result_callback(results, trained_pipeline, automl_obj, counts=counts): counts["add_result_callback"] += 1 max_iterations = 3 - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', objective="R2", max_iterations=max_iterations, - start_iteration_callback=start_iteration_callback, - add_result_callback=add_result_callback, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + objective="R2", + max_iterations=max_iterations, + start_iteration_callback=start_iteration_callback, + add_result_callback=add_result_callback, + n_jobs=1, + ) automl.search() assert counts["start_iteration_callback"] == len(get_estimators("regression")) + 1 @@ -89,7 +131,9 @@ def add_result_callback(results, trained_pipeline, automl_obj, counts=counts): def test_plot_disabled_missing_dependency(X_y_regression, has_minimal_dependencies): X, y = X_y_regression - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', max_iterations=3) + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="regression", max_iterations=3 + ) if has_minimal_dependencies: with pytest.raises(AttributeError): automl.plot.search_iteration_plot @@ -98,15 +142,20 @@ def test_plot_disabled_missing_dependency(X_y_regression, has_minimal_dependenci def test_plot_iterations_max_iterations(X_y_regression): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y = X_y_regression - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', max_iterations=3, n_jobs=1) + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="regression", max_iterations=3, n_jobs=1 + ) automl.search() plot = automl.plot.search_iteration_plot() plot_data = plot.data[0] - x = pd.Series(plot_data['x']) - y = pd.Series(plot_data['y']) + x = pd.Series(plot_data["x"]) + y = pd.Series(plot_data["y"]) assert isinstance(plot, go.Figure) assert 
x.is_monotonic_increasing @@ -116,15 +165,25 @@ def test_plot_iterations_max_iterations(X_y_regression): def test_plot_iterations_max_time(X_y_regression): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y = X_y_regression - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', max_time=10, random_seed=1, n_jobs=1) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + max_time=10, + random_seed=1, + n_jobs=1, + ) automl.search(show_iteration_plot=False) plot = automl.plot.search_iteration_plot() plot_data = plot.data[0] - x = pd.Series(plot_data['x']) - y = pd.Series(plot_data['y']) + x = pd.Series(plot_data["x"]) + y = pd.Series(plot_data["y"]) assert isinstance(plot, go.Figure) assert x.is_monotonic_increasing @@ -135,26 +194,52 @@ def test_plot_iterations_max_time(X_y_regression): def test_log_metrics_only_passed_directly(X_y_regression): X, y = X_y_regression - with pytest.raises(ObjectiveNotFoundError, match="RootMeanSquaredLogError is not a valid Objective!"): - AutoMLSearch(X_train=X, y_train=y, problem_type='regression', additional_objectives=['RootMeanSquaredLogError', 'MeanSquaredLogError']) - - ar = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', additional_objectives=[RootMeanSquaredLogError(), MeanSquaredLogError()]) - assert ar.additional_objectives[0].name == 'Root Mean Squared Log Error' - assert ar.additional_objectives[1].name == 'Mean Squared Log Error' + with pytest.raises( + ObjectiveNotFoundError, + match="RootMeanSquaredLogError is not a valid Objective!", + ): + AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + additional_objectives=["RootMeanSquaredLogError", "MeanSquaredLogError"], + ) + + ar = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + additional_objectives=[RootMeanSquaredLogError(), MeanSquaredLogError()], + ) + assert ar.additional_objectives[0].name == "Root Mean Squared Log Error" + assert ar.additional_objectives[1].name == "Mean Squared Log Error" def test_automl_allowed_pipelines_no_allowed_pipelines(X_y_regression): X, y = X_y_regression with pytest.raises(ValueError, match="No allowed pipelines to search"): - AutoMLSearch(X_train=X, y_train=y, problem_type='regression', allowed_pipelines=None, allowed_model_families=[]) - - -@patch('evalml.pipelines.RegressionPipeline.score') -@patch('evalml.pipelines.RegressionPipeline.fit') -def test_automl_allowed_pipelines_specified_allowed_pipelines(mock_fit, mock_score, dummy_regression_pipeline_class, X_y_regression): + AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + allowed_pipelines=None, + allowed_model_families=[], + ) + + +@patch("evalml.pipelines.RegressionPipeline.score") +@patch("evalml.pipelines.RegressionPipeline.fit") +def test_automl_allowed_pipelines_specified_allowed_pipelines( + mock_fit, mock_score, dummy_regression_pipeline_class, X_y_regression +): X, y = X_y_regression - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', - allowed_pipelines=[dummy_regression_pipeline_class({})], allowed_model_families=None) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + allowed_pipelines=[dummy_regression_pipeline_class({})], + allowed_model_families=None, + ) mock_score.return_value = {automl.objective.name: 1.0} expected_pipelines = 
[dummy_regression_pipeline_class({})] mock_score.return_value = {automl.objective.name: 1.0} @@ -168,13 +253,26 @@ def test_automl_allowed_pipelines_specified_allowed_pipelines(mock_fit, mock_sco assert automl.allowed_model_families == [ModelFamily.NONE] -@patch('evalml.pipelines.RegressionPipeline.score') -@patch('evalml.pipelines.RegressionPipeline.fit') -def test_automl_allowed_pipelines_specified_allowed_model_families(mock_fit, mock_score, X_y_regression, assert_allowed_pipelines_equal_helper): +@patch("evalml.pipelines.RegressionPipeline.score") +@patch("evalml.pipelines.RegressionPipeline.fit") +def test_automl_allowed_pipelines_specified_allowed_model_families( + mock_fit, mock_score, X_y_regression, assert_allowed_pipelines_equal_helper +): X, y = X_y_regression - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', allowed_pipelines=None, allowed_model_families=[ModelFamily.RANDOM_FOREST]) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + allowed_pipelines=None, + allowed_model_families=[ModelFamily.RANDOM_FOREST], + ) mock_score.return_value = {automl.objective.name: 1.0} - expected_pipelines = [make_pipeline(X, y, estimator, ProblemTypes.REGRESSION) for estimator in get_estimators(ProblemTypes.REGRESSION, model_families=[ModelFamily.RANDOM_FOREST])] + expected_pipelines = [ + make_pipeline(X, y, estimator, ProblemTypes.REGRESSION) + for estimator in get_estimators( + ProblemTypes.REGRESSION, model_families=[ModelFamily.RANDOM_FOREST] + ) + ] assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) assert set(automl.allowed_model_families) == set([ModelFamily.RANDOM_FOREST]) automl.search() @@ -183,8 +281,19 @@ def test_automl_allowed_pipelines_specified_allowed_model_families(mock_fit, moc mock_fit.reset_mock() mock_score.reset_mock() - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', allowed_pipelines=None, allowed_model_families=['random_forest']) - expected_pipelines = [make_pipeline(X, y, estimator, ProblemTypes.REGRESSION) for estimator in get_estimators(ProblemTypes.REGRESSION, model_families=[ModelFamily.RANDOM_FOREST])] + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + allowed_pipelines=None, + allowed_model_families=["random_forest"], + ) + expected_pipelines = [ + make_pipeline(X, y, estimator, ProblemTypes.REGRESSION) + for estimator in get_estimators( + ProblemTypes.REGRESSION, model_families=[ModelFamily.RANDOM_FOREST] + ) + ] assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) assert set(automl.allowed_model_families) == set([ModelFamily.RANDOM_FOREST]) automl.search() @@ -192,90 +301,155 @@ def test_automl_allowed_pipelines_specified_allowed_model_families(mock_fit, moc mock_score.assert_called() -@patch('evalml.pipelines.RegressionPipeline.score') -@patch('evalml.pipelines.RegressionPipeline.fit') -def test_automl_allowed_pipelines_init_allowed_both_not_specified(mock_fit, mock_score, X_y_regression, assert_allowed_pipelines_equal_helper): +@patch("evalml.pipelines.RegressionPipeline.score") +@patch("evalml.pipelines.RegressionPipeline.fit") +def test_automl_allowed_pipelines_init_allowed_both_not_specified( + mock_fit, mock_score, X_y_regression, assert_allowed_pipelines_equal_helper +): X, y = X_y_regression - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', allowed_pipelines=None, allowed_model_families=None) + automl = AutoMLSearch( + X_train=X, + y_train=y, + 
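# A compact sketch of the invariant the model-family tests in this file assert:
# restricting AutoMLSearch by model family should yield exactly the pipelines
# that make_pipeline builds for the matching estimators. Imports follow the
# test module's own import block; the helper name below is ours, not evalml API.
from evalml.model_family import ModelFamily
from evalml.pipelines.components.utils import get_estimators
from evalml.pipelines.utils import make_pipeline
from evalml.problem_types import ProblemTypes


def expected_random_forest_pipelines(X, y):
    # Mirrors the `expected_pipelines` list the tests compare against.
    return [
        make_pipeline(X, y, estimator, ProblemTypes.REGRESSION)
        for estimator in get_estimators(
            ProblemTypes.REGRESSION, model_families=[ModelFamily.RANDOM_FOREST]
        )
    ]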
problem_type="regression", + allowed_pipelines=None, + allowed_model_families=None, + ) mock_score.return_value = {automl.objective.name: 1.0} - expected_pipelines = [make_pipeline(X, y, estimator, ProblemTypes.REGRESSION) for estimator in get_estimators(ProblemTypes.REGRESSION, model_families=None)] + expected_pipelines = [ + make_pipeline(X, y, estimator, ProblemTypes.REGRESSION) + for estimator in get_estimators(ProblemTypes.REGRESSION, model_families=None) + ] assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) - assert set(automl.allowed_model_families) == set([p.model_family for p in expected_pipelines]) + assert set(automl.allowed_model_families) == set( + [p.model_family for p in expected_pipelines] + ) automl.search() mock_fit.assert_called() mock_score.assert_called() -@patch('evalml.pipelines.RegressionPipeline.score') -@patch('evalml.pipelines.RegressionPipeline.fit') -def test_automl_allowed_pipelines_init_allowed_both_specified(mock_fit, mock_score, dummy_regression_pipeline_class, X_y_regression, assert_allowed_pipelines_equal_helper): +@patch("evalml.pipelines.RegressionPipeline.score") +@patch("evalml.pipelines.RegressionPipeline.fit") +def test_automl_allowed_pipelines_init_allowed_both_specified( + mock_fit, + mock_score, + dummy_regression_pipeline_class, + X_y_regression, + assert_allowed_pipelines_equal_helper, +): X, y = X_y_regression - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', allowed_pipelines=[dummy_regression_pipeline_class({})], allowed_model_families=[ModelFamily.RANDOM_FOREST]) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="regression", + allowed_pipelines=[dummy_regression_pipeline_class({})], + allowed_model_families=[ModelFamily.RANDOM_FOREST], + ) mock_score.return_value = {automl.objective.name: 1.0} expected_pipelines = [dummy_regression_pipeline_class({})] assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) - assert set(automl.allowed_model_families) == set([p.model_family for p in expected_pipelines]) + assert set(automl.allowed_model_families) == set( + [p.model_family for p in expected_pipelines] + ) automl.search() mock_fit.assert_called() mock_score.assert_called() -@pytest.mark.parametrize('is_linear', [True, False]) -@patch('evalml.pipelines.RegressionPipeline.score') -@patch('evalml.pipelines.RegressionPipeline.fit') -def test_automl_allowed_pipelines_search(mock_fit, mock_score, is_linear, dummy_regression_pipeline_class, nonlinear_regression_pipeline_class, X_y_regression): +@pytest.mark.parametrize("is_linear", [True, False]) +@patch("evalml.pipelines.RegressionPipeline.score") +@patch("evalml.pipelines.RegressionPipeline.fit") +def test_automl_allowed_pipelines_search( + mock_fit, + mock_score, + is_linear, + dummy_regression_pipeline_class, + nonlinear_regression_pipeline_class, + X_y_regression, +): X, y = X_y_regression - mock_score.return_value = {'R2': 1.0} - pipeline_class = dummy_regression_pipeline_class if is_linear else nonlinear_regression_pipeline_class + mock_score.return_value = {"R2": 1.0} + pipeline_class = ( + dummy_regression_pipeline_class + if is_linear + else nonlinear_regression_pipeline_class + ) allowed_pipelines = [pipeline_class({})] start_iteration_callback = MagicMock() - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', - max_iterations=2, start_iteration_callback=start_iteration_callback, - allowed_pipelines=allowed_pipelines) + automl = AutoMLSearch( + X_train=X, + y_train=y, + 
problem_type="regression", + max_iterations=2, + start_iteration_callback=start_iteration_callback, + allowed_pipelines=allowed_pipelines, + ) automl.search() assert start_iteration_callback.call_count == 2 - assert isinstance(start_iteration_callback.call_args_list[0][0][0], RegressionPipeline) + assert isinstance( + start_iteration_callback.call_args_list[0][0][0], RegressionPipeline + ) assert isinstance(start_iteration_callback.call_args_list[1][0][0], pipeline_class) -@patch('evalml.pipelines.TimeSeriesRegressionPipeline.score', return_value={"R2": 0.3}) -@patch('evalml.pipelines.TimeSeriesRegressionPipeline.fit') +@patch("evalml.pipelines.TimeSeriesRegressionPipeline.score", return_value={"R2": 0.3}) +@patch("evalml.pipelines.TimeSeriesRegressionPipeline.fit") def test_automl_supports_time_series_regression(mock_fit, mock_score, X_y_regression): X, y = X_y_regression X = pd.DataFrame(X, columns=[f"Column_{str(i)}" for i in range(20)]) - X["Date"] = pd.date_range(start='1/1/2018', periods=X.shape[0]) - - configuration = {"date_index": "Date", "gap": 0, "max_delay": 0, 'delay_target': False, 'delay_features': True} + X["Date"] = pd.date_range(start="1/1/2018", periods=X.shape[0]) + + configuration = { + "date_index": "Date", + "gap": 0, + "max_delay": 0, + "delay_target": False, + "delay_features": True, + } - automl = AutoMLSearch(X_train=X, y_train=y, problem_type="time series regression", problem_configuration=configuration, - max_batches=2) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="time series regression", + problem_configuration=configuration, + max_batches=2, + ) automl.search() assert isinstance(automl.data_splitter, TimeSeriesSplit) - dt = configuration.pop('date_index') - for result in automl.results['pipeline_results'].values(): - assert result['pipeline_class'] == TimeSeriesRegressionPipeline + dt = configuration.pop("date_index") + for result in automl.results["pipeline_results"].values(): + assert result["pipeline_class"] == TimeSeriesRegressionPipeline if result["id"] == 0: continue - if 'ARIMA Regressor' in result["parameters"]: - dt_ = result['parameters']['ARIMA Regressor'].pop('date_index') - assert 'DateTime Featurization Component' not in result['parameters'].keys() - assert 'Delayed Feature Transformer' not in result['parameters'].keys() + if "ARIMA Regressor" in result["parameters"]: + dt_ = result["parameters"]["ARIMA Regressor"].pop("date_index") + assert "DateTime Featurization Component" not in result["parameters"].keys() + assert "Delayed Feature Transformer" not in result["parameters"].keys() else: - dt_ = result['parameters']['Delayed Feature Transformer'].pop('date_index') + dt_ = result["parameters"]["Delayed Feature Transformer"].pop("date_index") assert dt == dt_ for param_key, param_val in configuration.items(): - if 'ARIMA Regressor' not in result["parameters"]: - assert result['parameters']['Delayed Feature Transformer'][param_key] == configuration[param_key] - assert result['parameters']['pipeline'][param_key] == configuration[param_key] - - -@pytest.mark.parametrize("sampler_method", [None, 'auto', 'Undersampler', 'Oversampler']) + if "ARIMA Regressor" not in result["parameters"]: + assert ( + result["parameters"]["Delayed Feature Transformer"][param_key] + == configuration[param_key] + ) + assert ( + result["parameters"]["pipeline"][param_key] == configuration[param_key] + ) + + +@pytest.mark.parametrize( + "sampler_method", [None, "auto", "Undersampler", "Oversampler"] +) def 
test_automl_regression_no_sampler(sampler_method, X_y_regression): X, y = X_y_regression - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', sampler_method=sampler_method) + automl = AutoMLSearch( + X_train=X, y_train=y, problem_type="regression", sampler_method=sampler_method + ) for pipeline in automl.allowed_pipelines: assert not any("sampler" in c.name for c in pipeline.component_graph) diff --git a/evalml/tests/automl_tests/test_automl_utils.py b/evalml/tests/automl_tests/test_automl_utils.py index b42834a385..b92be15188 100644 --- a/evalml/tests/automl_tests/test_automl_utils.py +++ b/evalml/tests/automl_tests/test_automl_utils.py @@ -13,29 +13,43 @@ get_default_primary_search_objective, get_hyperparameter_ranges, make_data_splitter, - tune_binary_threshold + tune_binary_threshold, ) from evalml.objectives import F1, R2, LogLossBinary, LogLossMulticlass from evalml.pipelines import BinaryClassificationPipeline -from evalml.preprocessing.data_splitters import ( - TimeSeriesSplit, - TrainingValidationSplit -) +from evalml.preprocessing.data_splitters import TimeSeriesSplit, TrainingValidationSplit from evalml.problem_types import ProblemTypes from evalml.utils.woodwork_utils import infer_feature_types def test_get_default_primary_search_objective(): assert isinstance(get_default_primary_search_objective("binary"), LogLossBinary) - assert isinstance(get_default_primary_search_objective(ProblemTypes.BINARY), LogLossBinary) - assert isinstance(get_default_primary_search_objective("multiclass"), LogLossMulticlass) - assert isinstance(get_default_primary_search_objective(ProblemTypes.MULTICLASS), LogLossMulticlass) + assert isinstance( + get_default_primary_search_objective(ProblemTypes.BINARY), LogLossBinary + ) + assert isinstance( + get_default_primary_search_objective("multiclass"), LogLossMulticlass + ) + assert isinstance( + get_default_primary_search_objective(ProblemTypes.MULTICLASS), LogLossMulticlass + ) assert isinstance(get_default_primary_search_objective("regression"), R2) assert isinstance(get_default_primary_search_objective(ProblemTypes.REGRESSION), R2) - assert isinstance(get_default_primary_search_objective("time series binary"), LogLossBinary) - assert isinstance(get_default_primary_search_objective(ProblemTypes.TIME_SERIES_BINARY), LogLossBinary) - assert isinstance(get_default_primary_search_objective("time series multiclass"), LogLossMulticlass) - assert isinstance(get_default_primary_search_objective(ProblemTypes.TIME_SERIES_MULTICLASS), LogLossMulticlass) + assert isinstance( + get_default_primary_search_objective("time series binary"), LogLossBinary + ) + assert isinstance( + get_default_primary_search_objective(ProblemTypes.TIME_SERIES_BINARY), + LogLossBinary, + ) + assert isinstance( + get_default_primary_search_objective("time series multiclass"), + LogLossMulticlass, + ) + assert isinstance( + get_default_primary_search_objective(ProblemTypes.TIME_SERIES_MULTICLASS), + LogLossMulticlass, + ) with pytest.raises(KeyError, match="Problem type 'auto' does not exist"): get_default_primary_search_objective("auto") @@ -46,18 +60,25 @@ def test_make_data_splitter_default(problem_type, large_data): n = 10 if large_data: n = _LARGE_DATA_ROW_THRESHOLD + 1 - X = pd.DataFrame({'col_0': list(range(n)), - 'target': list(range(n))}) - y = X.pop('target') + X = pd.DataFrame({"col_0": list(range(n)), "target": list(range(n))}) + y = X.pop("target") problem_configuration = None - if problem_type in [ProblemTypes.TIME_SERIES_REGRESSION, - 
ProblemTypes.TIME_SERIES_BINARY, - ProblemTypes.TIME_SERIES_MULTICLASS]: - problem_configuration = {'gap': 1, 'max_delay': 7, 'date_index': None} - - data_splitter = make_data_splitter(X, y, problem_type, problem_configuration=problem_configuration) - if large_data and problem_type in [ProblemTypes.REGRESSION, ProblemTypes.BINARY, ProblemTypes.MULTICLASS]: + if problem_type in [ + ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ]: + problem_configuration = {"gap": 1, "max_delay": 7, "date_index": None} + + data_splitter = make_data_splitter( + X, y, problem_type, problem_configuration=problem_configuration + ) + if large_data and problem_type in [ + ProblemTypes.REGRESSION, + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ]: assert isinstance(data_splitter, TrainingValidationSplit) assert data_splitter.stratify is None assert data_splitter.random_seed == 0 @@ -77,9 +98,11 @@ def test_make_data_splitter_default(problem_type, large_data): assert data_splitter.shuffle assert data_splitter.random_state == 0 - if problem_type in [ProblemTypes.TIME_SERIES_REGRESSION, - ProblemTypes.TIME_SERIES_BINARY, - ProblemTypes.TIME_SERIES_MULTICLASS]: + if problem_type in [ + ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ]: assert isinstance(data_splitter, TimeSeriesSplit) assert data_splitter.n_splits == 3 assert data_splitter.gap == 1 @@ -87,17 +110,23 @@ def test_make_data_splitter_default(problem_type, large_data): assert data_splitter.date_index is None -@pytest.mark.parametrize("problem_type, expected_data_splitter", [(ProblemTypes.REGRESSION, KFold), - (ProblemTypes.BINARY, StratifiedKFold), - (ProblemTypes.MULTICLASS, StratifiedKFold)]) +@pytest.mark.parametrize( + "problem_type, expected_data_splitter", + [ + (ProblemTypes.REGRESSION, KFold), + (ProblemTypes.BINARY, StratifiedKFold), + (ProblemTypes.MULTICLASS, StratifiedKFold), + ], +) def test_make_data_splitter_parameters(problem_type, expected_data_splitter): n = 10 - X = pd.DataFrame({'col_0': list(range(n)), - 'target': list(range(n))}) - y = X.pop('target') + X = pd.DataFrame({"col_0": list(range(n)), "target": list(range(n))}) + y = X.pop("target") random_seed = 42 - data_splitter = make_data_splitter(X, y, problem_type, n_splits=5, random_seed=random_seed) + data_splitter = make_data_splitter( + X, y, problem_type, n_splits=5, random_seed=random_seed + ) assert isinstance(data_splitter, expected_data_splitter) assert data_splitter.n_splits == 5 assert data_splitter.shuffle @@ -106,12 +135,22 @@ def test_make_data_splitter_parameters(problem_type, expected_data_splitter): def test_make_data_splitter_parameters_time_series(): n = 10 - X = pd.DataFrame({'col_0': list(range(n)), - 'target': list(range(n))}) - y = X.pop('target') - - for problem_type in [ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS]: - data_splitter = make_data_splitter(X, y, problem_type, problem_configuration={'gap': 1, 'max_delay': 7, 'date_index': None}, n_splits=5, shuffle=False) + X = pd.DataFrame({"col_0": list(range(n)), "target": list(range(n))}) + y = X.pop("target") + + for problem_type in [ + ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ]: + data_splitter = make_data_splitter( + X, + y, + problem_type, + problem_configuration={"gap": 1, "max_delay": 7, "date_index": None}, + n_splits=5, + shuffle=False, + 
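# The splitter tests here reduce to the following usage. The import path for
# make_data_splitter is assumed to match the test module's import block
# (evalml.automl.utils); everything else mirrors the assertions in the tests.
import pandas as pd

from evalml.automl.utils import make_data_splitter
from evalml.preprocessing.data_splitters import TimeSeriesSplit
from evalml.problem_types import ProblemTypes

X = pd.DataFrame({"col_0": list(range(10))})
y = pd.Series(list(range(10)))
splitter = make_data_splitter(
    X,
    y,
    ProblemTypes.TIME_SERIES_REGRESSION,
    problem_configuration={"gap": 1, "max_delay": 7, "date_index": None},
    n_splits=5,
    shuffle=False,
)
assert isinstance(splitter, TimeSeriesSplit)
assert (splitter.n_splits, splitter.gap, splitter.max_delay) == (5, 1, 7)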
) assert isinstance(data_splitter, TimeSeriesSplit) assert data_splitter.n_splits == 5 assert data_splitter.gap == 1 @@ -121,64 +160,92 @@ def test_make_data_splitter_parameters_time_series(): def test_make_data_splitter_error(): n = 10 - X = pd.DataFrame({'col_0': list(range(n)), - 'target': list(range(n))}) - y = X.pop('target') + X = pd.DataFrame({"col_0": list(range(n)), "target": list(range(n))}) + y = X.pop("target") - with pytest.raises(ValueError, match="problem_configuration is required for time series problem types"): + with pytest.raises( + ValueError, + match="problem_configuration is required for time series problem types", + ): make_data_splitter(X, y, ProblemTypes.TIME_SERIES_REGRESSION) with pytest.raises(KeyError, match="Problem type 'XYZ' does not exist"): - make_data_splitter(X, y, 'XYZ') + make_data_splitter(X, y, "XYZ") -@pytest.mark.parametrize("problem_type", [ProblemTypes.REGRESSION, ProblemTypes.BINARY, ProblemTypes.MULTICLASS]) +@pytest.mark.parametrize( + "problem_type", + [ProblemTypes.REGRESSION, ProblemTypes.BINARY, ProblemTypes.MULTICLASS], +) @pytest.mark.parametrize("large_data", [True, False]) def test_make_data_splitter_error_shuffle_random_state(problem_type, large_data): n = 10 if large_data: n = _LARGE_DATA_ROW_THRESHOLD + 1 - X = pd.DataFrame({'col_0': list(range(n)), - 'target': list(range(n))}) - y = X.pop('target') + X = pd.DataFrame({"col_0": list(range(n)), "target": list(range(n))}) + y = X.pop("target") if large_data: - make_data_splitter(X, y, problem_type, n_splits=5, shuffle=False, random_seed=42) + make_data_splitter( + X, y, problem_type, n_splits=5, shuffle=False, random_seed=42 + ) else: - with pytest.raises(ValueError, match="Setting a random_state has no effect since shuffle is False."): - make_data_splitter(X, y, problem_type, n_splits=5, shuffle=False, random_seed=42) - - -@patch('evalml.objectives.BinaryClassificationObjective.optimize_threshold') -@patch('evalml.pipelines.BinaryClassificationPipeline._encode_targets', side_effect=lambda y: y) -@patch('evalml.pipelines.BinaryClassificationPipeline.predict_proba') -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_tune_binary_threshold(mock_fit, mock_score, mock_predict_proba, mock_encode_targets, mock_optimize_threshold, - dummy_binary_pipeline_class, X_y_binary): + with pytest.raises( + ValueError, + match="Setting a random_state has no effect since shuffle is False.", + ): + make_data_splitter( + X, y, problem_type, n_splits=5, shuffle=False, random_seed=42 + ) + + +@patch("evalml.objectives.BinaryClassificationObjective.optimize_threshold") +@patch( + "evalml.pipelines.BinaryClassificationPipeline._encode_targets", + side_effect=lambda y: y, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba") +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_tune_binary_threshold( + mock_fit, + mock_score, + mock_predict_proba, + mock_encode_targets, + mock_optimize_threshold, + dummy_binary_pipeline_class, + X_y_binary, +): mock_optimize_threshold.return_value = 0.42 - mock_score.return_value = {'F1': 1.0} + mock_score.return_value = {"F1": 1.0} X, y = X_y_binary X = infer_feature_types(X) y = infer_feature_types(y) pipeline = dummy_binary_pipeline_class({}) - tune_binary_threshold(pipeline, F1(), 'binary', X, y) + tune_binary_threshold(pipeline, F1(), "binary", X, y) assert pipeline.threshold == 0.42 pipeline 
= dummy_binary_pipeline_class({}) - tune_binary_threshold(pipeline, F1(), 'binary', None, None) + tune_binary_threshold(pipeline, F1(), "binary", None, None) assert pipeline.threshold == 0.5 pipeline = dummy_binary_pipeline_class({}) - tune_binary_threshold(pipeline, F1(), 'multiclass', X, y) + tune_binary_threshold(pipeline, F1(), "multiclass", X, y) assert pipeline.threshold is None -@pytest.mark.parametrize("size", ['large', 'small']) -@pytest.mark.parametrize("categorical_columns", ['none', 'all', 'some']) -@pytest.mark.parametrize("problem_type", ['binary', 'multiclass']) +@pytest.mark.parametrize("size", ["large", "small"]) +@pytest.mark.parametrize("categorical_columns", ["none", "all", "some"]) +@pytest.mark.parametrize("problem_type", ["binary", "multiclass"]) @pytest.mark.parametrize("sampler_balanced_ratio", [1, 0.5, 0.25, 0.2, 0.1, 0.05]) -def test_get_best_sampler_for_data_auto(sampler_balanced_ratio, problem_type, categorical_columns, size, mock_imbalanced_data_X_y, has_minimal_dependencies): +def test_get_best_sampler_for_data_auto( + sampler_balanced_ratio, + problem_type, + categorical_columns, + size, + mock_imbalanced_data_X_y, + has_minimal_dependencies, +): X, y = mock_imbalanced_data_X_y(problem_type, categorical_columns, size) name_output = get_best_sampler_for_data(X, y, "auto", sampler_balanced_ratio) print(size, len(y)) @@ -186,47 +253,56 @@ def test_get_best_sampler_for_data_auto(sampler_balanced_ratio, problem_type, ca # the imbalanced data we get has a class ratio of 0.2 minority:majority assert name_output is None else: - if size == 'large' or has_minimal_dependencies: - assert name_output == 'Undersampler' + if size == "large" or has_minimal_dependencies: + assert name_output == "Undersampler" else: - if categorical_columns == 'none': - assert name_output == 'SMOTE Oversampler' - elif categorical_columns == 'some': - assert name_output == 'SMOTENC Oversampler' + if categorical_columns == "none": + assert name_output == "SMOTE Oversampler" + elif categorical_columns == "some": + assert name_output == "SMOTENC Oversampler" else: - assert name_output == 'SMOTEN Oversampler' + assert name_output == "SMOTEN Oversampler" @pytest.mark.parametrize("sampler_method", ["Undersampler", "Oversampler"]) -@pytest.mark.parametrize("categorical_columns", ['none', 'all', 'some']) -def test_get_best_sampler_for_data_sampler_method(categorical_columns, sampler_method, mock_imbalanced_data_X_y, has_minimal_dependencies): - X, y = mock_imbalanced_data_X_y('binary', categorical_columns, 'large') +@pytest.mark.parametrize("categorical_columns", ["none", "all", "some"]) +def test_get_best_sampler_for_data_sampler_method( + categorical_columns, + sampler_method, + mock_imbalanced_data_X_y, + has_minimal_dependencies, +): + X, y = mock_imbalanced_data_X_y("binary", categorical_columns, "large") name_output = get_best_sampler_for_data(X, y, sampler_method, 0.5) - if sampler_method == 'Undersampler' or has_minimal_dependencies: - assert name_output == 'Undersampler' + if sampler_method == "Undersampler" or has_minimal_dependencies: + assert name_output == "Undersampler" else: - if categorical_columns == 'none': - assert name_output == 'SMOTE Oversampler' - elif categorical_columns == 'some': - assert name_output == 'SMOTENC Oversampler' + if categorical_columns == "none": + assert name_output == "SMOTE Oversampler" + elif categorical_columns == "some": + assert name_output == "SMOTENC Oversampler" else: - assert name_output == 'SMOTEN Oversampler' + assert name_output == "SMOTEN 
Oversampler" def test_get_hyperparameter_ranges(): - pipeline_ = BinaryClassificationPipeline(component_graph=["Imputer", "Random Forest Classifier"]) + pipeline_ = BinaryClassificationPipeline( + component_graph=["Imputer", "Random Forest Classifier"] + ) custom_hyperparameters_ = { - "Imputer": { - "numeric_impute_strategy": Categorical(["most_frequent", "mean"]) - }, - "Random Forest Classifier": { - "n_estimators": Integer(150, 160) - } + "Imputer": {"numeric_impute_strategy": Categorical(["most_frequent", "mean"])}, + "Random Forest Classifier": {"n_estimators": Integer(150, 160)}, } - algo = IterativeAlgorithm(allowed_pipelines=[pipeline_], - random_seed=0, - custom_hyperparameters=custom_hyperparameters_) - algo_ranges = algo._tuners['Random Forest Classifier w/ Imputer']._pipeline_hyperparameter_ranges - hyper_ranges = get_hyperparameter_ranges(pipeline_.component_graph, custom_hyperparameters_) + algo = IterativeAlgorithm( + allowed_pipelines=[pipeline_], + random_seed=0, + custom_hyperparameters=custom_hyperparameters_, + ) + algo_ranges = algo._tuners[ + "Random Forest Classifier w/ Imputer" + ]._pipeline_hyperparameter_ranges + hyper_ranges = get_hyperparameter_ranges( + pipeline_.component_graph, custom_hyperparameters_ + ) assert algo_ranges == hyper_ranges diff --git a/evalml/tests/automl_tests/test_dask_engine.py b/evalml/tests/automl_tests/test_dask_engine.py index 284985eaeb..6b6b28bc9a 100644 --- a/evalml/tests/automl_tests/test_dask_engine.py +++ b/evalml/tests/automl_tests/test_dask_engine.py @@ -10,7 +10,7 @@ from evalml.automl.engine.engine_base import ( JobLogger, evaluate_pipeline, - train_pipeline + train_pipeline, ) from evalml.automl.engine.sequential_engine import SequentialEngine from evalml.automl.utils import AutoMLConfig @@ -19,13 +19,12 @@ from evalml.tests.automl_tests.dask_test_utils import ( TestPipelineSlow, TestSchemaCheckPipeline, - automl_data + automl_data, ) @pytest.mark.usefixtures("X_y_binary_cls") class TestDaskEngine(unittest.TestCase): - @classmethod def setUpClass(cls) -> None: cls.client = Client() @@ -34,41 +33,62 @@ def test_init(self): engine = DaskEngine(client=self.client) assert engine.client == self.client - with pytest.raises(TypeError, match="Expected dask.distributed.Client, received"): + with pytest.raises( + TypeError, match="Expected dask.distributed.Client, received" + ): DaskEngine(client="Client") def test_submit_training_job_single(self): - """ Test that training a single pipeline using the parallel engine produces the - same results as simply running the train_pipeline function. 
""" + """Test that training a single pipeline using the parallel engine produces the + same results as simply running the train_pipeline function.""" X, y = self.X_y_binary engine = DaskEngine(client=self.client) - pipeline = BinaryClassificationPipeline(component_graph=["Logistic Regression Classifier"], - parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = BinaryClassificationPipeline( + component_graph=["Logistic Regression Classifier"], + parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, + ) # Verify that engine fits a pipeline - pipeline_future = engine.submit_training_job(X=X, y=y, automl_config=automl_data, pipeline=pipeline) + pipeline_future = engine.submit_training_job( + X=X, y=y, automl_config=automl_data, pipeline=pipeline + ) dask_pipeline_fitted = pipeline_future.get_result() assert dask_pipeline_fitted._is_fitted # Verify parallelization has no effect on output of function - original_pipeline_fitted = train_pipeline(pipeline, X, y, optimize_thresholds=automl_data.optimize_thresholds, - objective=automl_data.objective) + original_pipeline_fitted = train_pipeline( + pipeline, + X, + y, + optimize_thresholds=automl_data.optimize_thresholds, + objective=automl_data.objective, + ) assert dask_pipeline_fitted == original_pipeline_fitted - pd.testing.assert_series_equal(dask_pipeline_fitted.predict(X), original_pipeline_fitted.predict(X)) + pd.testing.assert_series_equal( + dask_pipeline_fitted.predict(X), original_pipeline_fitted.predict(X) + ) def test_submit_training_jobs_multiple(self): - """ Test that training multiple pipelines using the parallel engine produces the - same results as the sequential engine. """ + """Test that training multiple pipelines using the parallel engine produces the + same results as the sequential engine.""" X, y = self.X_y_binary - pipelines = [BinaryClassificationPipeline(component_graph=["Logistic Regression Classifier"], - parameters={"Logistic Regression Classifier": {"n_jobs": 1}}), - BinaryClassificationPipeline(component_graph=["Baseline Classifier"]), - BinaryClassificationPipeline(component_graph=["SVM Classifier"])] + pipelines = [ + BinaryClassificationPipeline( + component_graph=["Logistic Regression Classifier"], + parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, + ), + BinaryClassificationPipeline(component_graph=["Baseline Classifier"]), + BinaryClassificationPipeline(component_graph=["SVM Classifier"]), + ] def fit_pipelines(pipelines, engine): futures = [] for pipeline in pipelines: - futures.append(engine.submit_training_job(X=X, y=y, automl_config=automl_data, pipeline=pipeline)) + futures.append( + engine.submit_training_job( + X=X, y=y, automl_config=automl_data, pipeline=pipeline + ) + ) results = [f.get_result() for f in futures] return results @@ -88,25 +108,30 @@ def fit_pipelines(pipelines, engine): assert par_pipeline in seq_pipelines def test_submit_evaluate_job_single(self): - """ Test that evaluating a single pipeline using the parallel engine produces the - same results as simply running the evaluate_pipeline function. 
""" + """Test that evaluating a single pipeline using the parallel engine produces the + same results as simply running the evaluate_pipeline function.""" X, y = self.X_y_binary X.ww.init() y = ww.init_series(y) - pipeline = BinaryClassificationPipeline(component_graph=["Logistic Regression Classifier"], - parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = BinaryClassificationPipeline( + component_graph=["Logistic Regression Classifier"], + parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, + ) engine = DaskEngine(client=self.client) # Verify that engine evaluates a pipeline - pipeline_future = engine.submit_evaluation_job(X=X, y=y, - automl_config=automl_data, pipeline=pipeline) + pipeline_future = engine.submit_evaluation_job( + X=X, y=y, automl_config=automl_data, pipeline=pipeline + ) assert isinstance(pipeline_future, DaskComputation) par_eval_results = pipeline_future.get_result() - original_eval_results = evaluate_pipeline(pipeline, automl_config=automl_data, X=X, y=y, logger=JobLogger()) + original_eval_results = evaluate_pipeline( + pipeline, automl_config=automl_data, X=X, y=y, logger=JobLogger() + ) # Ensure we get back the same output as the parallelized function. assert len(par_eval_results) == 3 @@ -125,25 +150,35 @@ def test_submit_evaluate_job_single(self): # Make sure a properly filled logger comes back. assert isinstance(par_eval_results.get("logger"), JobLogger) - assert par_eval_results.get("logger").logs == original_eval_results.get("logger").logs + assert ( + par_eval_results.get("logger").logs + == original_eval_results.get("logger").logs + ) def test_submit_evaluate_jobs_multiple(self): - """ Test that evaluating multiple pipelines using the parallel engine produces the - same results as the sequential engine. """ + """Test that evaluating multiple pipelines using the parallel engine produces the + same results as the sequential engine.""" X, y = self.X_y_binary X.ww.init() y = ww.init_series(y) - pipelines = [BinaryClassificationPipeline(component_graph=["Logistic Regression Classifier"], - parameters={"Logistic Regression Classifier": {"n_jobs": 1}}), - BinaryClassificationPipeline(component_graph=["Baseline Classifier"]), - BinaryClassificationPipeline(component_graph=["SVM Classifier"])] + pipelines = [ + BinaryClassificationPipeline( + component_graph=["Logistic Regression Classifier"], + parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, + ), + BinaryClassificationPipeline(component_graph=["Baseline Classifier"]), + BinaryClassificationPipeline(component_graph=["SVM Classifier"]), + ] def eval_pipelines(pipelines, engine): futures = [] for pipeline in pipelines: - futures.append(engine.submit_evaluation_job(X=X, y=y, - automl_config=automl_data, pipeline=pipeline)) + futures.append( + engine.submit_evaluation_job( + X=X, y=y, automl_config=automl_data, pipeline=pipeline + ) + ) results = [f.get_result() for f in futures] return results @@ -170,23 +205,30 @@ def eval_pipelines(pipelines, engine): assert par_pipeline in seq_pipelines def test_submit_scoring_job_single(self): - """ Test that scoring a single pipeline using the parallel engine produces the - same results as simply running the score_pipeline function. 
""" + """Test that scoring a single pipeline using the parallel engine produces the + same results as simply running the score_pipeline function.""" X, y = self.X_y_binary X.ww.init() y = ww.init_series(y) - pipeline = BinaryClassificationPipeline(component_graph=["Logistic Regression Classifier"], - parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = BinaryClassificationPipeline( + component_graph=["Logistic Regression Classifier"], + parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, + ) engine = DaskEngine(client=self.client) objectives = [automl_data.objective] - pipeline_future = engine.submit_training_job(X=X, y=y, - automl_config=automl_data, pipeline=pipeline) + pipeline_future = engine.submit_training_job( + X=X, y=y, automl_config=automl_data, pipeline=pipeline + ) pipeline = pipeline_future.get_result() - pipeline_score_future = engine.submit_scoring_job(X=X, y=y, - automl_config=automl_data, pipeline=pipeline, - objectives=objectives) + pipeline_score_future = engine.submit_scoring_job( + X=X, + y=y, + automl_config=automl_data, + pipeline=pipeline, + objectives=objectives, + ) assert isinstance(pipeline_score_future, DaskComputation) pipeline_score = pipeline_score_future.get_result() @@ -196,28 +238,41 @@ def test_submit_scoring_job_single(self): assert pipeline_score == original_pipeline_score def test_submit_scoring_jobs_multiple(self): - """ Test that scoring multiple pipelines using the parallel engine produces the - same results as the sequential engine. """ + """Test that scoring multiple pipelines using the parallel engine produces the + same results as the sequential engine.""" X, y = self.X_y_binary X.ww.init() y = ww.init_series(y) - pipelines = [BinaryClassificationPipeline(component_graph=["Logistic Regression Classifier"], - parameters={"Logistic Regression Classifier": {"n_jobs": 1}}), - BinaryClassificationPipeline(component_graph=["Baseline Classifier"]), - BinaryClassificationPipeline(component_graph=["SVM Classifier"])] + pipelines = [ + BinaryClassificationPipeline( + component_graph=["Logistic Regression Classifier"], + parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, + ), + BinaryClassificationPipeline(component_graph=["Baseline Classifier"]), + BinaryClassificationPipeline(component_graph=["SVM Classifier"]), + ] def score_pipelines(pipelines, engine): futures = [] for pipeline in pipelines: - futures.append(engine.submit_training_job(X=X, y=y, - automl_config=automl_data, pipeline=pipeline)) + futures.append( + engine.submit_training_job( + X=X, y=y, automl_config=automl_data, pipeline=pipeline + ) + ) pipelines = [f.get_result() for f in futures] futures = [] for pipeline in pipelines: - futures.append(engine.submit_scoring_job(X=X, y=y, - automl_config=automl_data, pipeline=pipeline, - objectives=[automl_data.objective])) + futures.append( + engine.submit_scoring_job( + X=X, + y=y, + automl_config=automl_data, + pipeline=pipeline, + objectives=[automl_data.objective], + ) + ) results = [f.get_result() for f in futures] return results @@ -232,14 +287,16 @@ def score_pipelines(pipelines, engine): assert set(par_scores) == set(seq_scores) def test_cancel_job(self): - """ Test that training a single pipeline using the parallel engine produces the - same results as simply running the train_pipeline function. 
""" + """Test that training a single pipeline using the parallel engine produces the + same results as simply running the train_pipeline function.""" X, y = self.X_y_binary engine = DaskEngine(client=self.client) pipeline = TestPipelineSlow({"Logistic Regression Classifier": {"n_jobs": 1}}) # Verify that engine fits a pipeline - pipeline_future = engine.submit_training_job(X=X, y=y, automl_config=automl_data, pipeline=pipeline) + pipeline_future = engine.submit_training_job( + X=X, y=y, automl_config=automl_data, pipeline=pipeline + ) pipeline_future.cancel() assert pipeline_future.is_cancelled @@ -247,29 +304,44 @@ def test_dask_sends_woodwork_schema(self): X, y = self.X_y_binary engine = DaskEngine(client=self.client) - X.ww.init(logical_types={0: "Categorical"}, semantic_tags={0: ['my cool feature']}) + X.ww.init( + logical_types={0: "Categorical"}, semantic_tags={0: ["my cool feature"]} + ) y = ww.init_series(y) - new_config = AutoMLConfig(data_splitter=automl_data.data_splitter, - problem_type=automl_data.problem_type, - objective=automl_data.objective, - additional_objectives=automl_data.additional_objectives, - optimize_thresholds=automl_data.optimize_thresholds, - error_callback=automl_data.error_callback, - random_seed=automl_data.random_seed, - X_schema=X.ww.schema, - y_schema=y.ww.schema) + new_config = AutoMLConfig( + data_splitter=automl_data.data_splitter, + problem_type=automl_data.problem_type, + objective=automl_data.objective, + additional_objectives=automl_data.additional_objectives, + optimize_thresholds=automl_data.optimize_thresholds, + error_callback=automl_data.error_callback, + random_seed=automl_data.random_seed, + X_schema=X.ww.schema, + y_schema=y.ww.schema, + ) # TestSchemaCheckPipeline will verify that the schema is preserved by the time we call # pipeline.fit and pipeline.score - pipeline = TestSchemaCheckPipeline(component_graph=["One Hot Encoder", "Logistic Regression Classifier"], - parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, - X_schema_to_check=X.ww.schema, y_schema_to_check=y.ww.schema) - - future = engine.submit_training_job(X=X, y=y, automl_config=new_config, pipeline=pipeline) + pipeline = TestSchemaCheckPipeline( + component_graph=["One Hot Encoder", "Logistic Regression Classifier"], + parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, + X_schema_to_check=X.ww.schema, + y_schema_to_check=y.ww.schema, + ) + + future = engine.submit_training_job( + X=X, y=y, automl_config=new_config, pipeline=pipeline + ) fitted_pipeline = future.get_result() - future = engine.submit_scoring_job(X=X, y=y, automl_config=new_config, pipeline=fitted_pipeline, objectives=["F1"]) + future = engine.submit_scoring_job( + X=X, + y=y, + automl_config=new_config, + pipeline=fitted_pipeline, + objectives=["F1"], + ) _ = future.get_result() future = engine.submit_evaluation_job(new_config, pipeline, X, y) diff --git a/evalml/tests/automl_tests/test_engine_base.py b/evalml/tests/automl_tests/test_engine_base.py index 5570e2667a..200c9137cf 100644 --- a/evalml/tests/automl_tests/test_engine_base.py +++ b/evalml/tests/automl_tests/test_engine_base.py @@ -11,62 +11,111 @@ from evalml.utils import get_logger -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_train_and_score_pipelines(mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary): +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def 
test_train_and_score_pipelines( + mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary +): X, y = X_y_binary - mock_score.return_value = {'Log Loss Binary': 0.42} - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_time=1, max_batches=1, - allowed_pipelines=[dummy_binary_pipeline_class({})]) + mock_score.return_value = {"Log Loss Binary": 0.42} + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_time=1, + max_batches=1, + allowed_pipelines=[dummy_binary_pipeline_class({})], + ) pipeline = dummy_binary_pipeline_class({}) - evaluation_result = evaluate_pipeline(pipeline, automl.automl_config, automl.X_train, automl.y_train, logger=MagicMock()).get("scores") + evaluation_result = evaluate_pipeline( + pipeline, + automl.automl_config, + automl.X_train, + automl.y_train, + logger=MagicMock(), + ).get("scores") assert mock_fit.call_count == automl.data_splitter.get_n_splits() assert mock_score.call_count == automl.data_splitter.get_n_splits() - assert evaluation_result.get('training_time') is not None - assert evaluation_result.get('cv_score_mean') == 0.42 - pd.testing.assert_series_equal(evaluation_result.get('cv_scores'), pd.Series([0.42] * 3)) + assert evaluation_result.get("training_time") is not None + assert evaluation_result.get("cv_score_mean") == 0.42 + pd.testing.assert_series_equal( + evaluation_result.get("cv_scores"), pd.Series([0.42] * 3) + ) for i in range(automl.data_splitter.get_n_splits()): - assert evaluation_result['cv_data'][i]['all_objective_scores']['Log Loss Binary'] == 0.42 + assert ( + evaluation_result["cv_data"][i]["all_objective_scores"]["Log Loss Binary"] + == 0.42 + ) -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -def test_train_and_score_pipelines_error(mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary, caplog): +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +def test_train_and_score_pipelines_error( + mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary, caplog +): X, y = X_y_binary - mock_score.side_effect = Exception('yeet') - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_time=1, max_batches=1, - allowed_pipelines=[dummy_binary_pipeline_class({})]) + mock_score.side_effect = Exception("yeet") + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_time=1, + max_batches=1, + allowed_pipelines=[dummy_binary_pipeline_class({})], + ) pipeline = dummy_binary_pipeline_class({}) job_log = JobLogger() - result = evaluate_pipeline(pipeline, automl.automl_config, automl.X_train, automl.y_train, logger=job_log) + result = evaluate_pipeline( + pipeline, automl.automl_config, automl.X_train, automl.y_train, logger=job_log + ) evaluation_result, job_log = result.get("scores"), result.get("logger") logger = get_logger(__file__) job_log.write_to_logger(logger) assert mock_fit.call_count == automl.data_splitter.get_n_splits() assert mock_score.call_count == automl.data_splitter.get_n_splits() - assert evaluation_result.get('training_time') is not None - assert np.isnan(evaluation_result.get('cv_score_mean')) - pd.testing.assert_series_equal(evaluation_result.get('cv_scores'), pd.Series([np.nan] * 3)) + assert evaluation_result.get("training_time") is not None + assert np.isnan(evaluation_result.get("cv_score_mean")) + pd.testing.assert_series_equal( + evaluation_result.get("cv_scores"), 
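# The engine_base tests around this point pin down evaluate_pipeline's return
# shape: a dict with "scores" (training_time, cv_score_mean, cv_scores, cv_data)
# and a "logger" holding buffered JobLogger records. A small caller sketch, with
# imports taken from the test files shown here; the wrapper name is ours.
from evalml.automl.engine.engine_base import JobLogger, evaluate_pipeline
from evalml.utils import get_logger


def evaluate_and_log(pipeline, automl):
    # Run one cross-validated evaluation the way the engine does, then replay
    # the buffered log records into a real logger.
    result = evaluate_pipeline(
        pipeline,
        automl.automl_config,
        automl.X_train,
        automl.y_train,
        logger=JobLogger(),
    )
    scores, job_log = result.get("scores"), result.get("logger")
    job_log.write_to_logger(get_logger(__file__))
    return scores["cv_score_mean"], scores["cv_scores"]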
pd.Series([np.nan] * 3) + ) for i in range(automl.data_splitter.get_n_splits()): - assert np.isnan(evaluation_result['cv_data'][i]['all_objective_scores']['Log Loss Binary']) - assert 'yeet' in caplog.text - - -@patch('evalml.objectives.BinaryClassificationObjective.optimize_threshold') -@patch('evalml.pipelines.BinaryClassificationPipeline._encode_targets', side_effect=lambda y: y) -@patch('evalml.pipelines.BinaryClassificationPipeline.predict_proba') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -@patch('evalml.automl.engine.engine_base.split_data') -def test_train_pipeline_trains_and_tunes_threshold(mock_split_data, mock_pipeline_fit, - mock_predict_proba, mock_encode_targets, mock_optimize, X_y_binary, - dummy_binary_pipeline_class): + assert np.isnan( + evaluation_result["cv_data"][i]["all_objective_scores"]["Log Loss Binary"] + ) + assert "yeet" in caplog.text + + +@patch("evalml.objectives.BinaryClassificationObjective.optimize_threshold") +@patch( + "evalml.pipelines.BinaryClassificationPipeline._encode_targets", + side_effect=lambda y: y, +) +@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +@patch("evalml.automl.engine.engine_base.split_data") +def test_train_pipeline_trains_and_tunes_threshold( + mock_split_data, + mock_pipeline_fit, + mock_predict_proba, + mock_encode_targets, + mock_optimize, + X_y_binary, + dummy_binary_pipeline_class, +): X, y = X_y_binary - mock_split_data.return_value = split_data(X, y, "binary", test_size=0.2, random_seed=0) - - _ = train_pipeline(dummy_binary_pipeline_class({}), X, y, - optimize_thresholds=True, objective=LogLossBinary()) + mock_split_data.return_value = split_data( + X, y, "binary", test_size=0.2, random_seed=0 + ) + + _ = train_pipeline( + dummy_binary_pipeline_class({}), + X, + y, + optimize_thresholds=True, + objective=LogLossBinary(), + ) mock_pipeline_fit.assert_called_once() mock_optimize.assert_not_called() @@ -76,8 +125,9 @@ def test_train_pipeline_trains_and_tunes_threshold(mock_split_data, mock_pipelin mock_optimize.reset_mock() mock_split_data.reset_mock() - _ = train_pipeline(dummy_binary_pipeline_class({}), X, y, - optimize_thresholds=True, objective=F1()) + _ = train_pipeline( + dummy_binary_pipeline_class({}), X, y, optimize_thresholds=True, objective=F1() + ) mock_pipeline_fit.assert_called_once() mock_optimize.assert_called_once() mock_split_data.assert_called_once() diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py index 28886bc46a..78ec31b146 100644 --- a/evalml/tests/automl_tests/test_iterative_algorithm.py +++ b/evalml/tests/automl_tests/test_iterative_algorithm.py @@ -4,15 +4,12 @@ import pytest from skopt.space import Categorical, Integer, Real -from evalml.automl.automl_algorithm import ( - AutoMLAlgorithmException, - IterativeAlgorithm -) +from evalml.automl.automl_algorithm import AutoMLAlgorithmException, IterativeAlgorithm from evalml.model_family import ModelFamily from evalml.pipelines import ( BinaryClassificationPipeline, StackedEnsembleClassifier, - StackedEnsembleRegressor + StackedEnsembleRegressor, ) from evalml.pipelines.components import Estimator from evalml.pipelines.components.utils import get_estimators @@ -31,7 +28,9 @@ def test_iterative_algorithm_init(): assert algo.allowed_pipelines == [] -def test_iterative_algorithm_allowed_pipelines(logistic_regression_binary_pipeline_class): +def 
test_iterative_algorithm_allowed_pipelines( + logistic_regression_binary_pipeline_class, +): allowed_pipelines = [logistic_regression_binary_pipeline_class({})] algo = IterativeAlgorithm(allowed_pipelines=allowed_pipelines) assert algo.pipeline_number == 0 @@ -41,24 +40,35 @@ def test_iterative_algorithm_allowed_pipelines(logistic_regression_binary_pipeli @pytest.fixture def dummy_binary_pipeline_classes(): - def _method(hyperparameters=['default', 'other']): + def _method(hyperparameters=["default", "other"]): class MockEstimator(Estimator): name = "Mock Classifier" model_family = ModelFamily.RANDOM_FOREST supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS] if isinstance(hyperparameters, (list, tuple, Real, Categorical, Integer)): - hyperparameter_ranges = {'dummy_parameter': hyperparameters} + hyperparameter_ranges = {"dummy_parameter": hyperparameters} else: - hyperparameter_ranges = {'dummy_parameter': [hyperparameters]} - - def __init__(self, dummy_parameter='default', n_jobs=-1, random_seed=0, **kwargs): - super().__init__(parameters={'dummy_parameter': dummy_parameter, **kwargs, - 'n_jobs': n_jobs}, - component_obj=None, random_seed=random_seed) + hyperparameter_ranges = {"dummy_parameter": [hyperparameters]} + + def __init__( + self, dummy_parameter="default", n_jobs=-1, random_seed=0, **kwargs + ): + super().__init__( + parameters={ + "dummy_parameter": dummy_parameter, + **kwargs, + "n_jobs": n_jobs, + }, + component_obj=None, + random_seed=random_seed, + ) + + return [ + BinaryClassificationPipeline([MockEstimator]), + BinaryClassificationPipeline([MockEstimator]), + BinaryClassificationPipeline([MockEstimator]), + ] - return [BinaryClassificationPipeline([MockEstimator]), - BinaryClassificationPipeline([MockEstimator]), - BinaryClassificationPipeline([MockEstimator])] return _method @@ -73,17 +83,25 @@ def test_iterative_algorithm_empty(dummy_binary_pipeline_classes): assert algo.pipeline_number == 0 assert algo.batch_number == 1 - with pytest.raises(AutoMLAlgorithmException, match='No results were reported from the first batch'): + with pytest.raises( + AutoMLAlgorithmException, match="No results were reported from the first batch" + ): algo.next_batch() assert algo.batch_number == 1 assert algo.pipeline_number == 0 @pytest.mark.parametrize("ensembling_value", [True, False]) -@patch('evalml.pipelines.components.ensemble.StackedEnsembleClassifier._stacking_estimator_class') -def test_iterative_algorithm_results(mock_stack, ensembling_value, dummy_binary_pipeline_classes): +@patch( + "evalml.pipelines.components.ensemble.StackedEnsembleClassifier._stacking_estimator_class" +) +def test_iterative_algorithm_results( + mock_stack, ensembling_value, dummy_binary_pipeline_classes +): dummy_binary_pipeline_classes = dummy_binary_pipeline_classes() - algo = IterativeAlgorithm(allowed_pipelines=dummy_binary_pipeline_classes, ensembling=ensembling_value) + algo = IterativeAlgorithm( + allowed_pipelines=dummy_binary_pipeline_classes, ensembling=ensembling_value + ) assert algo.pipeline_number == 0 assert algo.batch_number == 0 assert algo.allowed_pipelines == dummy_binary_pipeline_classes @@ -91,7 +109,9 @@ def test_iterative_algorithm_results(mock_stack, ensembling_value, dummy_binary_ # initial batch contains one of each pipeline, with default parameters next_batch = algo.next_batch() assert len(next_batch) == len(dummy_binary_pipeline_classes) - assert [p.__class__ for p in next_batch] == [p.__class__ for p in dummy_binary_pipeline_classes] + assert [p.__class__ 
for p in next_batch] == [ + p.__class__ for p in dummy_binary_pipeline_classes + ] assert algo.pipeline_number == len(dummy_binary_pipeline_classes) assert algo.batch_number == 1 assert all([p.parameters == p.default_parameters for p in next_batch]) @@ -109,10 +129,18 @@ def test_iterative_algorithm_results(mock_stack, ensembling_value, dummy_binary_ for _ in range(len(dummy_binary_pipeline_classes)): next_batch = algo.next_batch() assert len(next_batch) == algo.pipelines_per_batch - num_pipelines_classes = (len(dummy_binary_pipeline_classes) + 1) if ensembling_value else len(dummy_binary_pipeline_classes) - cls = dummy_binary_pipeline_classes[(algo.batch_number - 2) % num_pipelines_classes].__class__ + num_pipelines_classes = ( + (len(dummy_binary_pipeline_classes) + 1) + if ensembling_value + else len(dummy_binary_pipeline_classes) + ) + cls = dummy_binary_pipeline_classes[ + (algo.batch_number - 2) % num_pipelines_classes + ].__class__ assert [p.__class__ for p in next_batch] == [cls] * len(next_batch) - assert all([p.parameters['Mock Classifier']['n_jobs'] == -1 for p in next_batch]) + assert all( + [p.parameters["Mock Classifier"]["n_jobs"] == -1 for p in next_batch] + ) assert all((p.random_seed == algo.random_seed) for p in next_batch) assert algo.pipeline_number == last_pipeline_number + len(next_batch) last_pipeline_number = algo.pipeline_number @@ -123,7 +151,9 @@ def test_iterative_algorithm_results(mock_stack, ensembling_value, dummy_binary_ for score, pipeline in zip(scores, next_batch): algo.add_result(score, pipeline, {"id": algo.pipeline_number}) - assert any([p != dummy_binary_pipeline_classes[0].parameters for p in all_parameters]) + assert any( + [p != dummy_binary_pipeline_classes[0].parameters for p in all_parameters] + ) if ensembling_value: # check next batch is stacking ensemble batch @@ -139,23 +169,34 @@ def test_iterative_algorithm_results(mock_stack, ensembling_value, dummy_binary_ algo.add_result(score, pipeline, {"id": algo.pipeline_number}) assert pipeline.model_family == ModelFamily.ENSEMBLE assert pipeline.random_seed == algo.random_seed - stack_args = mock_stack.call_args[1]['estimators'] + stack_args = mock_stack.call_args[1]["estimators"] estimators_used_in_ensemble = [args[1] for args in stack_args] - random_seeds_the_same = [(estimator.pipeline.random_seed == algo.random_seed) - for estimator in estimators_used_in_ensemble] + random_seeds_the_same = [ + (estimator.pipeline.random_seed == algo.random_seed) + for estimator in estimators_used_in_ensemble + ] assert all(random_seeds_the_same) assert ModelFamily.ENSEMBLE not in algo._best_pipeline_info @pytest.mark.parametrize("ensembling_value", [True, False]) -@patch('evalml.pipelines.components.ensemble.StackedEnsembleClassifier._stacking_estimator_class') -def test_iterative_algorithm_passes_pipeline_params(mock_stack, ensembling_value, dummy_binary_pipeline_classes): +@patch( + "evalml.pipelines.components.ensemble.StackedEnsembleClassifier._stacking_estimator_class" +) +def test_iterative_algorithm_passes_pipeline_params( + mock_stack, ensembling_value, dummy_binary_pipeline_classes +): dummy_binary_pipeline_classes = dummy_binary_pipeline_classes() - algo = IterativeAlgorithm(allowed_pipelines=dummy_binary_pipeline_classes, ensembling=ensembling_value, - pipeline_params={'pipeline': {"gap": 2, "max_delay": 10}}) + algo = IterativeAlgorithm( + allowed_pipelines=dummy_binary_pipeline_classes, + ensembling=ensembling_value, + pipeline_params={"pipeline": {"gap": 2, "max_delay": 10}}, + ) next_batch = 
algo.next_batch() - assert all([p.parameters['pipeline'] == {"gap": 2, "max_delay": 10} for p in next_batch]) + assert all( + [p.parameters["pipeline"] == {"gap": 2, "max_delay": 10} for p in next_batch] + ) # the "best" score will be the 1st dummy pipeline scores = np.arange(0, len(next_batch)) @@ -165,20 +206,34 @@ def test_iterative_algorithm_passes_pipeline_params(mock_stack, ensembling_value for i in range(1, 5): for _ in range(len(dummy_binary_pipeline_classes)): next_batch = algo.next_batch() - assert all([p.parameters['pipeline'] == {"gap": 2, "max_delay": 10} for p in next_batch]) + assert all( + [ + p.parameters["pipeline"] == {"gap": 2, "max_delay": 10} + for p in next_batch + ] + ) scores = -np.arange(0, len(next_batch)) for score, pipeline in zip(scores, next_batch): algo.add_result(score, pipeline, {"id": algo.pipeline_number}) if ensembling_value: next_batch = algo.next_batch() - input_pipelines = next_batch[0].parameters['Stacked Ensemble Classifier']['input_pipelines'] - assert all([pl.parameters['pipeline'] == {"gap": 2, "max_delay": 10} for pl in input_pipelines]) + input_pipelines = next_batch[0].parameters["Stacked Ensemble Classifier"][ + "input_pipelines" + ] + assert all( + [ + pl.parameters["pipeline"] == {"gap": 2, "max_delay": 10} + for pl in input_pipelines + ] + ) def test_iterative_algorithm_passes_njobs(dummy_binary_pipeline_classes): dummy_binary_pipeline_classes = dummy_binary_pipeline_classes() - algo = IterativeAlgorithm(allowed_pipelines=dummy_binary_pipeline_classes, n_jobs=2, ensembling=False) + algo = IterativeAlgorithm( + allowed_pipelines=dummy_binary_pipeline_classes, n_jobs=2, ensembling=False + ) next_batch = algo.next_batch() # the "best" score will be the 1st dummy pipeline @@ -189,16 +244,23 @@ def test_iterative_algorithm_passes_njobs(dummy_binary_pipeline_classes): for i in range(1, 3): for _ in range(len(dummy_binary_pipeline_classes)): next_batch = algo.next_batch() - assert all([p.parameters['Mock Classifier']['n_jobs'] == 2 for p in next_batch]) + assert all( + [p.parameters["Mock Classifier"]["n_jobs"] == 2 for p in next_batch] + ) scores = -np.arange(0, len(next_batch)) for score, pipeline in zip(scores, next_batch): algo.add_result(score, pipeline, {"id": algo.pipeline_number}) @pytest.mark.parametrize("ensembling_value", [True, False]) -def test_iterative_algorithm_one_allowed_pipeline(ensembling_value, logistic_regression_binary_pipeline_class): +def test_iterative_algorithm_one_allowed_pipeline( + ensembling_value, logistic_regression_binary_pipeline_class +): # Checks that when len(allowed_pipeline) == 1, ensembling is not run, even if set to True - algo = IterativeAlgorithm(allowed_pipelines=[logistic_regression_binary_pipeline_class({})], ensembling=ensembling_value) + algo = IterativeAlgorithm( + allowed_pipelines=[logistic_regression_binary_pipeline_class({})], + ensembling=ensembling_value, + ) assert algo.pipeline_number == 0 assert algo.batch_number == 0 assert algo.allowed_pipelines == [logistic_regression_binary_pipeline_class({})] @@ -206,7 +268,9 @@ def test_iterative_algorithm_one_allowed_pipeline(ensembling_value, logistic_reg # initial batch contains one of each pipeline, with default parameters next_batch = algo.next_batch() assert len(next_batch) == 1 - assert [p.__class__ for p in next_batch] == [logistic_regression_binary_pipeline_class] * len(next_batch) + assert [p.__class__ for p in next_batch] == [ + logistic_regression_binary_pipeline_class + ] * len(next_batch) assert algo.pipeline_number == 1 assert 
algo.batch_number == 1 assert all([p.parameters == p.default_parameters for p in next_batch]) @@ -223,7 +287,9 @@ def test_iterative_algorithm_one_allowed_pipeline(ensembling_value, logistic_reg next_batch = algo.next_batch() assert len(next_batch) == algo.pipelines_per_batch assert all((p.random_seed == algo.random_seed) for p in next_batch) - assert [p.__class__ for p in next_batch] == [logistic_regression_binary_pipeline_class] * len(next_batch) + assert [p.__class__ for p in next_batch] == [ + logistic_regression_binary_pipeline_class + ] * len(next_batch) assert algo.pipeline_number == last_pipeline_number + len(next_batch) last_pipeline_number = algo.pipeline_number assert algo.batch_number == last_batch_number + 1 @@ -233,14 +299,26 @@ def test_iterative_algorithm_one_allowed_pipeline(ensembling_value, logistic_reg for score, pipeline in zip(scores, next_batch): algo.add_result(score, pipeline, {"id": algo.pipeline_number}) - assert any([p != logistic_regression_binary_pipeline_class({}).default_parameters for p in all_parameters]) + assert any( + [ + p != logistic_regression_binary_pipeline_class({}).default_parameters + for p in all_parameters + ] + ) @pytest.mark.parametrize("text_in_ensembling", [True, False]) @pytest.mark.parametrize("n_jobs", [-1, 0, 1, 2, 3]) -def test_iterative_algorithm_stacked_ensemble_n_jobs_binary(n_jobs, text_in_ensembling, dummy_binary_pipeline_classes): +def test_iterative_algorithm_stacked_ensemble_n_jobs_binary( + n_jobs, text_in_ensembling, dummy_binary_pipeline_classes +): dummy_binary_pipeline_classes = dummy_binary_pipeline_classes() - algo = IterativeAlgorithm(allowed_pipelines=dummy_binary_pipeline_classes, ensembling=True, text_in_ensembling=text_in_ensembling, n_jobs=n_jobs) + algo = IterativeAlgorithm( + allowed_pipelines=dummy_binary_pipeline_classes, + ensembling=True, + text_in_ensembling=text_in_ensembling, + n_jobs=n_jobs, + ) next_batch = algo.next_batch() seen_ensemble = False scores = range(0, len(next_batch)) @@ -253,16 +331,32 @@ def test_iterative_algorithm_stacked_ensemble_n_jobs_binary(n_jobs, text_in_ense if isinstance(pipeline.estimator, StackedEnsembleClassifier): seen_ensemble = True if text_in_ensembling: - assert pipeline.parameters['Stacked Ensemble Classifier']['n_jobs'] == 1 + assert ( + pipeline.parameters["Stacked Ensemble Classifier"]["n_jobs"] + == 1 + ) else: - assert pipeline.parameters['Stacked Ensemble Classifier']['n_jobs'] == n_jobs + assert ( + pipeline.parameters["Stacked Ensemble Classifier"]["n_jobs"] + == n_jobs + ) assert seen_ensemble @pytest.mark.parametrize("text_in_ensembling", [True, False]) @pytest.mark.parametrize("n_jobs", [-1, 0, 1, 2, 3]) -def test_iterative_algorithm_stacked_ensemble_n_jobs_regression(n_jobs, text_in_ensembling, linear_regression_pipeline_class): - algo = IterativeAlgorithm(allowed_pipelines=[linear_regression_pipeline_class({}), linear_regression_pipeline_class({})], ensembling=True, text_in_ensembling=text_in_ensembling, n_jobs=n_jobs) +def test_iterative_algorithm_stacked_ensemble_n_jobs_regression( + n_jobs, text_in_ensembling, linear_regression_pipeline_class +): + algo = IterativeAlgorithm( + allowed_pipelines=[ + linear_regression_pipeline_class({}), + linear_regression_pipeline_class({}), + ], + ensembling=True, + text_in_ensembling=text_in_ensembling, + n_jobs=n_jobs, + ) next_batch = algo.next_batch() seen_ensemble = False scores = range(0, len(next_batch)) @@ -275,28 +369,54 @@ def test_iterative_algorithm_stacked_ensemble_n_jobs_regression(n_jobs, text_in_ if 
isinstance(pipeline.estimator, StackedEnsembleRegressor): seen_ensemble = True if text_in_ensembling: - assert pipeline.parameters['Stacked Ensemble Regressor']['n_jobs'] == 1 + assert ( + pipeline.parameters["Stacked Ensemble Regressor"]["n_jobs"] == 1 + ) else: - assert pipeline.parameters['Stacked Ensemble Regressor']['n_jobs'] == n_jobs + assert ( + pipeline.parameters["Stacked Ensemble Regressor"]["n_jobs"] + == n_jobs + ) assert seen_ensemble -@pytest.mark.parametrize("parameters", [1, "hello", 1.3, -1.0006, Categorical([1, 3, 4]), Integer(2, 4), Real(2, 6)]) +@pytest.mark.parametrize( + "parameters", + [1, "hello", 1.3, -1.0006, Categorical([1, 3, 4]), Integer(2, 4), Real(2, 6)], +) def test_iterative_algorithm_pipeline_params(parameters, dummy_binary_pipeline_classes): dummy_binary_pipeline_classes = dummy_binary_pipeline_classes(parameters) - algo = IterativeAlgorithm(allowed_pipelines=dummy_binary_pipeline_classes, - random_seed=0, - pipeline_params={'pipeline': {"gap": 2, "max_delay": 10}, - 'Mock Classifier': {'dummy_parameter': parameters}}) + algo = IterativeAlgorithm( + allowed_pipelines=dummy_binary_pipeline_classes, + random_seed=0, + pipeline_params={ + "pipeline": {"gap": 2, "max_delay": 10}, + "Mock Classifier": {"dummy_parameter": parameters}, + }, + ) parameter = parameters if isinstance(parameter, (Categorical, Integer, Real)): - with pytest.raises(ValueError, match="Pipeline parameters should not contain skopt.Space variables"): + with pytest.raises( + ValueError, + match="Pipeline parameters should not contain skopt.Space variables", + ): algo.next_batch() else: next_batch = algo.next_batch() - assert all([p.parameters['pipeline'] == {"gap": 2, "max_delay": 10} for p in next_batch]) - assert all([p.parameters['Mock Classifier'] == {"dummy_parameter": parameter, "n_jobs": -1} for p in next_batch]) + assert all( + [ + p.parameters["pipeline"] == {"gap": 2, "max_delay": 10} + for p in next_batch + ] + ) + assert all( + [ + p.parameters["Mock Classifier"] + == {"dummy_parameter": parameter, "n_jobs": -1} + for p in next_batch + ] + ) scores = np.arange(0, len(next_batch)) for score, pipeline in zip(scores, next_batch): @@ -305,21 +425,39 @@ def test_iterative_algorithm_pipeline_params(parameters, dummy_binary_pipeline_c # make sure that future batches have the same parameter value for i in range(1, 5): next_batch = algo.next_batch() - assert all([p.parameters['Mock Classifier']['dummy_parameter'] == parameter for p in next_batch]) + assert all( + [ + p.parameters["Mock Classifier"]["dummy_parameter"] == parameter + for p in next_batch + ] + ) -@pytest.mark.parametrize("parameters,hyperparameters", [(1, Categorical([1, 3, 4])), (3, Integer(2, 4))]) -def test_iterative_algorithm_custom_hyperparameters(parameters, hyperparameters, dummy_binary_pipeline_classes): +@pytest.mark.parametrize( + "parameters,hyperparameters", [(1, Categorical([1, 3, 4])), (3, Integer(2, 4))] +) +def test_iterative_algorithm_custom_hyperparameters( + parameters, hyperparameters, dummy_binary_pipeline_classes +): dummy_binary_pipeline_classes = dummy_binary_pipeline_classes(parameters) - algo = IterativeAlgorithm(allowed_pipelines=dummy_binary_pipeline_classes, - random_seed=0, - pipeline_params={'Mock Classifier': {'dummy_parameter': parameters}}, - custom_hyperparameters={'Mock Classifier': {'dummy_parameter': hyperparameters}}) + algo = IterativeAlgorithm( + allowed_pipelines=dummy_binary_pipeline_classes, + random_seed=0, + pipeline_params={"Mock Classifier": {"dummy_parameter": parameters}}, 
+ custom_hyperparameters={ + "Mock Classifier": {"dummy_parameter": hyperparameters} + }, + ) next_batch = algo.next_batch() - assert all([p.parameters['Mock Classifier']["n_jobs"] == -1 for p in next_batch]) - assert all([p.parameters['Mock Classifier']["dummy_parameter"] == parameters for p in next_batch]) + assert all([p.parameters["Mock Classifier"]["n_jobs"] == -1 for p in next_batch]) + assert all( + [ + p.parameters["Mock Classifier"]["dummy_parameter"] == parameters + for p in next_batch + ] + ) scores = np.arange(0, len(next_batch)) for score, pipeline in zip(scores, next_batch): @@ -330,10 +468,15 @@ def test_iterative_algorithm_custom_hyperparameters(parameters, hyperparameters, for i in range(1, 5): next_batch = algo.next_batch() for p in next_batch: - dummy = p.parameters['Mock Classifier']['dummy_parameter'] + dummy = p.parameters["Mock Classifier"]["dummy_parameter"] if dummy not in all_dummies: all_dummies.add(dummy) - assert all([p.parameters['Mock Classifier']['dummy_parameter'] in hyperparameters for p in next_batch]) + assert all( + [ + p.parameters["Mock Classifier"]["dummy_parameter"] in hyperparameters + for p in next_batch + ] + ) assert all_dummies == {1, 3, 4} if parameters == 1 else all_dummies == {2, 3, 4} @@ -342,36 +485,67 @@ class MockEstimator(Estimator): name = "Mock Classifier" model_family = ModelFamily.RANDOM_FOREST supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS] - hyperparameter_ranges = {'dummy_int_parameter': Integer(1, 10), - 'dummy_categorical_parameter': Categorical(["random", "dummy", "test"]), - 'dummy_real_parameter': Real(0, 1)} - - def __init__(self, dummy_int_parameter=0, dummy_categorical_parameter='dummy', dummy_real_parameter=1.0, n_jobs=-1, random_seed=0, **kwargs): - super().__init__(parameters={'dummy_int_parameter': dummy_int_parameter, - 'dummy_categorical_parameter': dummy_categorical_parameter, - 'dummy_real_parameter': dummy_real_parameter, - **kwargs, 'n_jobs': n_jobs}, - component_obj=None, random_seed=random_seed) + hyperparameter_ranges = { + "dummy_int_parameter": Integer(1, 10), + "dummy_categorical_parameter": Categorical(["random", "dummy", "test"]), + "dummy_real_parameter": Real(0, 1), + } + + def __init__( + self, + dummy_int_parameter=0, + dummy_categorical_parameter="dummy", + dummy_real_parameter=1.0, + n_jobs=-1, + random_seed=0, + **kwargs + ): + super().__init__( + parameters={ + "dummy_int_parameter": dummy_int_parameter, + "dummy_categorical_parameter": dummy_categorical_parameter, + "dummy_real_parameter": dummy_real_parameter, + **kwargs, + "n_jobs": n_jobs, + }, + component_obj=None, + random_seed=random_seed, + ) pipeline = BinaryClassificationPipeline([MockEstimator]) - algo = IterativeAlgorithm(allowed_pipelines=[pipeline, pipeline, pipeline], - pipeline_params={'pipeline': {'date_index': "Date", "gap": 2, "max_delay": 10}}, - random_seed=0, - _frozen_pipeline_parameters={ - "Mock Classifier": { - 'dummy_int_parameter': 6, - 'dummy_categorical_parameter': "random", - 'dummy_real_parameter': 0.1 - }}) + algo = IterativeAlgorithm( + allowed_pipelines=[pipeline, pipeline, pipeline], + pipeline_params={"pipeline": {"date_index": "Date", "gap": 2, "max_delay": 10}}, + random_seed=0, + _frozen_pipeline_parameters={ + "Mock Classifier": { + "dummy_int_parameter": 6, + "dummy_categorical_parameter": "random", + "dummy_real_parameter": 0.1, + } + }, + ) next_batch = algo.next_batch() - assert all([p.parameters['pipeline'] == {'date_index': "Date", "gap": 2, "max_delay": 10} for p in 
next_batch]) - assert all([p.parameters['Mock Classifier'] == { - 'dummy_int_parameter': 6, - 'dummy_categorical_parameter': "random", - 'dummy_real_parameter': 0.1, - "n_jobs": -1 - } for p in next_batch]) + assert all( + [ + p.parameters["pipeline"] + == {"date_index": "Date", "gap": 2, "max_delay": 10} + for p in next_batch + ] + ) + assert all( + [ + p.parameters["Mock Classifier"] + == { + "dummy_int_parameter": 6, + "dummy_categorical_parameter": "random", + "dummy_real_parameter": 0.1, + "n_jobs": -1, + } + for p in next_batch + ] + ) scores = np.arange(0, len(next_batch)) for score, pipeline in zip(scores, next_batch): @@ -380,26 +554,47 @@ def __init__(self, dummy_int_parameter=0, dummy_categorical_parameter='dummy', d # make sure that future batches remain in the hyperparam range for i in range(1, 5): next_batch = algo.next_batch() - assert all([p.parameters['Mock Classifier'] == { - 'dummy_int_parameter': 6, - 'dummy_categorical_parameter': "random", - 'dummy_real_parameter': 0.1, - "n_jobs": -1 - } for p in next_batch]) + assert all( + [ + p.parameters["Mock Classifier"] + == { + "dummy_int_parameter": 6, + "dummy_categorical_parameter": "random", + "dummy_real_parameter": 0.1, + "n_jobs": -1, + } + for p in next_batch + ] + ) def test_iterative_algorithm_pipeline_params_kwargs(dummy_binary_pipeline_classes): dummy_binary_pipeline_classes = dummy_binary_pipeline_classes() - algo = IterativeAlgorithm(allowed_pipelines=dummy_binary_pipeline_classes, - pipeline_params={'Mock Classifier': {'dummy_parameter': "dummy", 'fake_param': 'fake'}}, - random_seed=0) + algo = IterativeAlgorithm( + allowed_pipelines=dummy_binary_pipeline_classes, + pipeline_params={ + "Mock Classifier": {"dummy_parameter": "dummy", "fake_param": "fake"} + }, + random_seed=0, + ) next_batch = algo.next_batch() - assert all([p.parameters['Mock Classifier'] == {"dummy_parameter": "dummy", "n_jobs": -1, "fake_param": "fake"} for p in next_batch]) - - -def test_iterative_algorithm_results_best_pipeline_info_id(dummy_binary_pipeline_classes, logistic_regression_binary_pipeline_class): - allowed_pipelines = [dummy_binary_pipeline_classes()[0], logistic_regression_binary_pipeline_class({})] + assert all( + [ + p.parameters["Mock Classifier"] + == {"dummy_parameter": "dummy", "n_jobs": -1, "fake_param": "fake"} + for p in next_batch + ] + ) + + +def test_iterative_algorithm_results_best_pipeline_info_id( + dummy_binary_pipeline_classes, logistic_regression_binary_pipeline_class +): + allowed_pipelines = [ + dummy_binary_pipeline_classes()[0], + logistic_regression_binary_pipeline_class({}), + ] algo = IterativeAlgorithm(allowed_pipelines=allowed_pipelines) # initial batch contains one of each pipeline, with default parameters @@ -407,19 +602,31 @@ def test_iterative_algorithm_results_best_pipeline_info_id(dummy_binary_pipeline scores = np.arange(0, len(next_batch)) for pipeline_num, (score, pipeline) in enumerate(zip(scores, next_batch)): algo.add_result(score, pipeline, {"id": algo.pipeline_number + pipeline_num}) - assert algo._best_pipeline_info[ModelFamily.RANDOM_FOREST]['id'] == 3 - assert algo._best_pipeline_info[ModelFamily.LINEAR_MODEL]['id'] == 2 + assert algo._best_pipeline_info[ModelFamily.RANDOM_FOREST]["id"] == 3 + assert algo._best_pipeline_info[ModelFamily.LINEAR_MODEL]["id"] == 2 for i in range(1, 3): next_batch = algo.next_batch() - scores = -np.arange(1, len(next_batch)) # Score always gets better with each pipeline + scores = -np.arange( + 1, len(next_batch) + ) # Score always gets better with each 
pipeline for pipeline_num, (score, pipeline) in enumerate(zip(scores, next_batch)): - algo.add_result(score, pipeline, {"id": algo.pipeline_number + pipeline_num}) - assert algo._best_pipeline_info[pipeline.model_family]['id'] == algo.pipeline_number + pipeline_num - - -@pytest.mark.parametrize("problem_type", [ProblemTypes.REGRESSION, ProblemTypes.BINARY, ProblemTypes.MULTICLASS]) -def test_iterative_algorithm_first_batch_order(problem_type, X_y_binary, has_minimal_dependencies): + algo.add_result( + score, pipeline, {"id": algo.pipeline_number + pipeline_num} + ) + assert ( + algo._best_pipeline_info[pipeline.model_family]["id"] + == algo.pipeline_number + pipeline_num + ) + + +@pytest.mark.parametrize( + "problem_type", + [ProblemTypes.REGRESSION, ProblemTypes.BINARY, ProblemTypes.MULTICLASS], +) +def test_iterative_algorithm_first_batch_order( + problem_type, X_y_binary, has_minimal_dependencies +): X, y = X_y_binary estimators = get_estimators(problem_type, None) pipelines = [make_pipeline(X, y, e, problem_type) for e in estimators] @@ -430,39 +637,63 @@ def test_iterative_algorithm_first_batch_order(problem_type, X_y_binary, has_min estimators_in_first_batch = [p.estimator.name for p in next_batch] if problem_type == ProblemTypes.REGRESSION: - final_estimators = ['XGBoost Regressor', - 'LightGBM Regressor', - 'CatBoost Regressor'] + final_estimators = [ + "XGBoost Regressor", + "LightGBM Regressor", + "CatBoost Regressor", + ] else: - final_estimators = ['XGBoost Classifier', - 'LightGBM Classifier', - 'CatBoost Classifier'] + final_estimators = [ + "XGBoost Classifier", + "LightGBM Classifier", + "CatBoost Classifier", + ] if has_minimal_dependencies: final_estimators = [] if problem_type == ProblemTypes.REGRESSION: - assert estimators_in_first_batch == ['Linear Regressor', - 'Elastic Net Regressor', - 'Decision Tree Regressor', - 'Extra Trees Regressor', - 'Random Forest Regressor'] + final_estimators + assert ( + estimators_in_first_batch + == [ + "Linear Regressor", + "Elastic Net Regressor", + "Decision Tree Regressor", + "Extra Trees Regressor", + "Random Forest Regressor", + ] + + final_estimators + ) if problem_type == ProblemTypes.BINARY: - assert estimators_in_first_batch == ['Elastic Net Classifier', - 'Logistic Regression Classifier', - 'Decision Tree Classifier', - 'Extra Trees Classifier', - 'Random Forest Classifier'] + final_estimators + assert ( + estimators_in_first_batch + == [ + "Elastic Net Classifier", + "Logistic Regression Classifier", + "Decision Tree Classifier", + "Extra Trees Classifier", + "Random Forest Classifier", + ] + + final_estimators + ) if problem_type == ProblemTypes.MULTICLASS: - assert estimators_in_first_batch == ['Elastic Net Classifier', - 'Logistic Regression Classifier', - 'Decision Tree Classifier', - 'Extra Trees Classifier', - 'Random Forest Classifier'] + final_estimators - - -def test_iterative_algorithm_first_batch_order_param(X_y_binary, has_minimal_dependencies): + assert ( + estimators_in_first_batch + == [ + "Elastic Net Classifier", + "Logistic Regression Classifier", + "Decision Tree Classifier", + "Extra Trees Classifier", + "Random Forest Classifier", + ] + + final_estimators + ) + + +def test_iterative_algorithm_first_batch_order_param( + X_y_binary, has_minimal_dependencies +): X, y = X_y_binary - estimators = get_estimators('binary', None) - pipelines = [make_pipeline(X, y, e, 'binary') for e in estimators] + estimators = get_estimators("binary", None) + pipelines = [make_pipeline(X, y, e, "binary") for e in estimators] # 
put random forest first estimator_family_order = [ ModelFamily.RANDOM_FOREST, @@ -471,35 +702,56 @@ def test_iterative_algorithm_first_batch_order_param(X_y_binary, has_minimal_dep ModelFamily.EXTRA_TREES, ModelFamily.XGBOOST, ModelFamily.LIGHTGBM, - ModelFamily.CATBOOST + ModelFamily.CATBOOST, ] - algo = IterativeAlgorithm(allowed_pipelines=pipelines, _estimator_family_order=estimator_family_order) + algo = IterativeAlgorithm( + allowed_pipelines=pipelines, _estimator_family_order=estimator_family_order + ) next_batch = algo.next_batch() estimators_in_first_batch = [p.estimator.name for p in next_batch] - final_estimators = ['XGBoost Classifier', - 'LightGBM Classifier', - 'CatBoost Classifier'] + final_estimators = [ + "XGBoost Classifier", + "LightGBM Classifier", + "CatBoost Classifier", + ] if has_minimal_dependencies: final_estimators = [] - assert estimators_in_first_batch == ['Random Forest Classifier', - 'Elastic Net Classifier', - 'Logistic Regression Classifier', - 'Decision Tree Classifier', - 'Extra Trees Classifier'] + final_estimators - - -@pytest.mark.parametrize("sampler", ["Undersampler", "SMOTE Oversampler", "SMOTENC Oversampler", "SMOTEN Oversampler"]) + assert ( + estimators_in_first_batch + == [ + "Random Forest Classifier", + "Elastic Net Classifier", + "Logistic Regression Classifier", + "Decision Tree Classifier", + "Extra Trees Classifier", + ] + + final_estimators + ) + + +@pytest.mark.parametrize( + "sampler", + ["Undersampler", "SMOTE Oversampler", "SMOTENC Oversampler", "SMOTEN Oversampler"], +) @pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]) -def test_iterative_algorithm_sampling_params(problem_type, sampler, mock_imbalanced_data_X_y, has_minimal_dependencies): +def test_iterative_algorithm_sampling_params( + problem_type, sampler, mock_imbalanced_data_X_y, has_minimal_dependencies +): if has_minimal_dependencies and sampler != "Undersampler": - pytest.skip("Minimal dependencies, so we don't test the oversamplers for iterative algorithm") - X, y = mock_imbalanced_data_X_y(problem_type, "some", 'small') + pytest.skip( + "Minimal dependencies, so we don't test the oversamplers for iterative algorithm" + ) + X, y = mock_imbalanced_data_X_y(problem_type, "some", "small") estimators = get_estimators(problem_type, None) - pipelines = [make_pipeline(X, y, e, problem_type, sampler_name=sampler) for e in estimators] - algo = IterativeAlgorithm(allowed_pipelines=pipelines, - random_seed=0, - _frozen_pipeline_parameters={sampler: {"sampling_ratio": 0.5}}) + pipelines = [ + make_pipeline(X, y, e, problem_type, sampler_name=sampler) for e in estimators + ] + algo = IterativeAlgorithm( + allowed_pipelines=pipelines, + random_seed=0, + _frozen_pipeline_parameters={sampler: {"sampling_ratio": 0.5}}, + ) next_batch = algo.next_batch() for p in next_batch: for component in p._component_graph: diff --git a/evalml/tests/automl_tests/test_pipeline_search_plots.py b/evalml/tests/automl_tests/test_pipeline_search_plots.py index 6bde4aa357..cdf84a2ecb 100644 --- a/evalml/tests/automl_tests/test_pipeline_search_plots.py +++ b/evalml/tests/automl_tests/test_pipeline_search_plots.py @@ -7,41 +7,36 @@ def test_search_iteration_plot_class(): - pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) class MockObjective: def __init__(self): - self.name = 'Test Objective' + self.name = 
"Test Objective" self.greater_is_better = True class MockResults: def __init__(self): self.objective = MockObjective() self.results = { - 'pipeline_results': { - 2: { - "mean_cv_score": 0.50 - }, - 0: { - "mean_cv_score": 0.60 - }, - 1: { - "mean_cv_score": 0.75 - }, + "pipeline_results": { + 2: {"mean_cv_score": 0.50}, + 0: {"mean_cv_score": 0.60}, + 1: {"mean_cv_score": 0.75}, }, - 'search_order': [1, 2, 0] + "search_order": [1, 2, 0], } - self.rankings = pd.DataFrame({ - "mean_cv_score": [0.75, 0.60, 0.50] - }) + self.rankings = pd.DataFrame({"mean_cv_score": [0.75, 0.60, 0.50]}) mock_data = MockResults() plot = SearchIterationPlot(mock_data) # Check best score trace plot_data = plot.best_score_by_iter_fig.data[0] - x = list(plot_data['x']) - y = list(plot_data['y']) + x = list(plot_data["x"]) + y = list(plot_data["y"]) assert isinstance(plot, SearchIterationPlot) assert x == [0, 1, 2] @@ -49,25 +44,28 @@ def __init__(self): # Check current score trace plot_data = plot.best_score_by_iter_fig.data[1] - x = list(plot_data['x']) - y = list(plot_data['y']) + x = list(plot_data["x"]) + y = list(plot_data["y"]) assert isinstance(plot, SearchIterationPlot) assert x == [0, 1, 2] assert y == [0.60, 0.75, 0.50] -@patch('evalml.automl.pipeline_search_plots.jupyter_check') -@patch('evalml.automl.pipeline_search_plots.import_or_raise') +@patch("evalml.automl.pipeline_search_plots.jupyter_check") +@patch("evalml.automl.pipeline_search_plots.import_or_raise") def test_jupyter(import_check, jupyter_check): mock_data = MagicMock() - pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) jupyter_check.return_value = True with pytest.warns(None) as graph_valid: SearchIterationPlot(mock_data) assert len(graph_valid) == 0 - import_check.assert_called_with('ipywidgets', warning=True) + import_check.assert_called_with("ipywidgets", warning=True) jupyter_check.return_value = False with pytest.warns(None) as graph_valid: diff --git a/evalml/tests/automl_tests/test_search.py b/evalml/tests/automl_tests/test_search.py index bf9bc2e6d5..560531f7ff 100644 --- a/evalml/tests/automl_tests/test_search.py +++ b/evalml/tests/automl_tests/test_search.py @@ -6,42 +6,52 @@ from evalml.utils import infer_feature_types -@patch('evalml.data_checks.default_data_checks.DefaultDataChecks.validate') -@patch('evalml.automl.AutoMLSearch.search') +@patch("evalml.data_checks.default_data_checks.DefaultDataChecks.validate") +@patch("evalml.automl.AutoMLSearch.search") def test_search(mock_automl_search, mock_data_checks_validate, X_y_binary): X, y = X_y_binary # this doesn't exactly match the data check results schema but its enough to trigger the error in search() - data_check_results_expected = {'warnings': ['Warning 1', 'Warning 2']} + data_check_results_expected = {"warnings": ["Warning 1", "Warning 2"]} mock_data_checks_validate.return_value = data_check_results_expected - automl, data_check_results = search(X_train=X, y_train=y, problem_type='binary') + automl, data_check_results = search(X_train=X, y_train=y, problem_type="binary") assert isinstance(automl, AutoMLSearch) assert data_check_results is data_check_results_expected mock_data_checks_validate.assert_called_once() - data, target = mock_data_checks_validate.call_args[0][0], mock_data_checks_validate.call_args[1]['y'] + data, target = ( + mock_data_checks_validate.call_args[0][0], + 
mock_data_checks_validate.call_args[1]["y"], + ) pd.testing.assert_frame_equal(data, infer_feature_types(X)) pd.testing.assert_series_equal(target, infer_feature_types(y)) mock_automl_search.assert_called_once() -@patch('evalml.data_checks.default_data_checks.DefaultDataChecks.validate') -@patch('evalml.automl.AutoMLSearch.search') -def test_search_data_check_error(mock_automl_search, mock_data_checks_validate, X_y_binary): +@patch("evalml.data_checks.default_data_checks.DefaultDataChecks.validate") +@patch("evalml.automl.AutoMLSearch.search") +def test_search_data_check_error( + mock_automl_search, mock_data_checks_validate, X_y_binary +): X, y = X_y_binary # this doesn't exactly match the data check results schema but its enough to trigger the error in search() - data_check_results_expected = {'errors': ['Error 1', 'Error 2']} + data_check_results_expected = {"errors": ["Error 1", "Error 2"]} mock_data_checks_validate.return_value = data_check_results_expected - automl, data_check_results = search(X_train=X, y_train=y, problem_type='binary') + automl, data_check_results = search(X_train=X, y_train=y, problem_type="binary") assert automl is None assert data_check_results == data_check_results_expected mock_data_checks_validate.assert_called_once() - data, target = mock_data_checks_validate.call_args[0][0], mock_data_checks_validate.call_args[1]['y'] + data, target = ( + mock_data_checks_validate.call_args[0][0], + mock_data_checks_validate.call_args[1]["y"], + ) pd.testing.assert_frame_equal(data, infer_feature_types(X)) pd.testing.assert_series_equal(target, infer_feature_types(y)) -@patch('evalml.data_checks.default_data_checks.DefaultDataChecks.validate') -@patch('evalml.automl.AutoMLSearch.search') +@patch("evalml.data_checks.default_data_checks.DefaultDataChecks.validate") +@patch("evalml.automl.AutoMLSearch.search") def test_search_kwargs(mock_automl_search, mock_data_checks_validate, X_y_binary): X, y = X_y_binary - automl, data_check_results = search(X_train=X, y_train=y, problem_type='binary', max_iterations=42) + automl, data_check_results = search( + X_train=X, y_train=y, problem_type="binary", max_iterations=42 + ) assert automl.max_iterations == 42 diff --git a/evalml/tests/automl_tests/test_time_series_split.py b/evalml/tests/automl_tests/test_time_series_split.py index dbdbe2144a..1ec6a55ad4 100644 --- a/evalml/tests/automl_tests/test_time_series_split.py +++ b/evalml/tests/automl_tests/test_time_series_split.py @@ -8,10 +8,16 @@ def test_time_series_split_init(): ts_split = TimeSeriesSplit(gap=3, max_delay=4, n_splits=5, date_index=None) assert ts_split.get_n_splits() == 5 - with pytest.raises(ValueError, match="Both X and y cannot be None or empty in TimeSeriesSplit.split"): + with pytest.raises( + ValueError, + match="Both X and y cannot be None or empty in TimeSeriesSplit.split", + ): _ = list(ts_split.split(X=None, y=None)) - with pytest.raises(ValueError, match="Both X and y cannot be None or empty in TimeSeriesSplit.split"): + with pytest.raises( + ValueError, + match="Both X and y cannot be None or empty in TimeSeriesSplit.split", + ): _ = list(ts_split.split(X=pd.DataFrame(), y=pd.Series([]))) @@ -20,13 +26,26 @@ def test_time_series_split_n_splits_too_big(): X = pd.DataFrame({"features": range(15)}) # Each split would have 15 // 5 = 3 data points. 
However, this is smaller than the number of data_points required # for max_delay and gap - with pytest.raises(ValueError, match="Please use a smaller number of splits or collect more data."): + with pytest.raises( + ValueError, match="Please use a smaller number of splits or collect more data." + ): list(splitter.split(X)) -@pytest.mark.parametrize("max_delay,gap,date_index", [(0, 0, "Date"), (1, 0, None), (2, 0, "Date"), - (0, 3, None), (1, 1, "Date"), (4, 2, None)]) -@pytest.mark.parametrize("X_none,y_none", [(False, False), (True, False), (False, True)]) +@pytest.mark.parametrize( + "max_delay,gap,date_index", + [ + (0, 0, "Date"), + (1, 0, None), + (2, 0, "Date"), + (0, 3, None), + (1, 1, "Date"), + (4, 2, None), + ], +) +@pytest.mark.parametrize( + "X_none,y_none", [(False, False), (True, False), (False, True)] +) def test_time_series_split(max_delay, gap, date_index, X_none, y_none): X = pd.DataFrame({"features": range(1, 32)}) y = pd.Series(range(1, 32)) @@ -39,12 +58,25 @@ def test_time_series_split(max_delay, gap, date_index, X_none, y_none): else: X.index = pd.date_range("2020-10-01", "2020-10-31") - answer = [(pd.date_range("2020-10-01", f"2020-10-{10 + gap}"), pd.date_range(f"2020-10-{11 - max_delay}", f"2020-10-{17 + gap}")), - (pd.date_range("2020-10-01", f"2020-10-{17 + gap}"), pd.date_range(f"2020-10-{18 - max_delay}", f"2020-10-{24 + gap}")), - (pd.date_range("2020-10-01", f"2020-10-{24 + gap}"), pd.date_range(f"2020-10-{25 - max_delay}", "2020-10-31"))] - answer_dt = [(pd.Index(range(10 + gap)), pd.Index(range(10 - max_delay, 17 + gap))), - (pd.Index(range(17 + gap)), pd.Index(range(17 - max_delay, 24 + gap))), - (pd.Index(range(24 + gap)), pd.Index(range(24 - max_delay, 31)))] + answer = [ + ( + pd.date_range("2020-10-01", f"2020-10-{10 + gap}"), + pd.date_range(f"2020-10-{11 - max_delay}", f"2020-10-{17 + gap}"), + ), + ( + pd.date_range("2020-10-01", f"2020-10-{17 + gap}"), + pd.date_range(f"2020-10-{18 - max_delay}", f"2020-10-{24 + gap}"), + ), + ( + pd.date_range("2020-10-01", f"2020-10-{24 + gap}"), + pd.date_range(f"2020-10-{25 - max_delay}", "2020-10-31"), + ), + ] + answer_dt = [ + (pd.Index(range(10 + gap)), pd.Index(range(10 - max_delay, 17 + gap))), + (pd.Index(range(17 + gap)), pd.Index(range(17 - max_delay, 24 + gap))), + (pd.Index(range(24 + gap)), pd.Index(range(24 - max_delay, 31))), + ] if X_none: X = None diff --git a/evalml/tests/component_tests/test_arima_regressor.py b/evalml/tests/component_tests/test_arima_regressor.py index 9aaa89a27e..f0e6982850 100644 --- a/evalml/tests/component_tests/test_arima_regressor.py +++ b/evalml/tests/component_tests/test_arima_regressor.py @@ -9,8 +9,12 @@ from evalml.pipelines.components import ARIMARegressor from evalml.problem_types import ProblemTypes -sktime_arima = importorskip('sktime.forecasting.arima', reason='Skipping test because sktime not installed') -forecasting = importorskip('sktime.forecasting.base', reason='Skipping test because sktime not installed') +sktime_arima = importorskip( + "sktime.forecasting.arima", reason="Skipping test because sktime not installed" +) +forecasting = importorskip( + "sktime.forecasting.base", reason="Skipping test because sktime not installed" +) def test_model_family(): @@ -18,7 +22,9 @@ def test_model_family(): def test_problem_types(): - assert set(ARIMARegressor.supported_problem_types) == {ProblemTypes.TIME_SERIES_REGRESSION} + assert set(ARIMARegressor.supported_problem_types) == { + ProblemTypes.TIME_SERIES_REGRESSION + } def test_model_instance(ts_data): @@ -47,8 
+53,8 @@ def test_match_indices(ts_data): assert X_.index.equals(date_index) -@pytest.mark.parametrize('predict', [True, False]) -@pytest.mark.parametrize('dates_shape', [0, 1, 2]) +@pytest.mark.parametrize("predict", [True, False]) +@pytest.mark.parametrize("dates_shape", [0, 1, 2]) def test_format_dates(predict, dates_shape, ts_data): X, y = ts_data date_index = pd.date_range("2020-10-02", "2020-11-01") @@ -65,7 +71,7 @@ def test_format_dates(predict, dates_shape, ts_data): assert X_.index.equals(y_.index) assert isinstance(fh_, forecasting.ForecastingHorizon) elif dates_shape == 2: - with pytest.raises(ValueError, match='Found 2 columns'): + with pytest.raises(ValueError, match="Found 2 columns"): clf._format_dates(date_index, X, y, True) else: if dates_shape != 2: @@ -73,7 +79,7 @@ def test_format_dates(predict, dates_shape, ts_data): assert X_.index.equals(y_.index) assert _ is None elif dates_shape == 2: - with pytest.raises(ValueError, match='Found 2 columns'): + with pytest.raises(ValueError, match="Found 2 columns"): clf._format_dates(date_index, X, y, False) @@ -93,7 +99,7 @@ def test_fit_predict_ts_with_datetime_in_X_column(ts_data_seasonal): m_clf.fit(X=X[:250], y=y[:250]) y_pred = m_clf.predict(X=X[250:]) - X['Sample'] = pd.date_range(start='1/1/2016', periods=500) + X["Sample"] = pd.date_range(start="1/1/2016", periods=500) dt_clf = ARIMARegressor(d=None) dt_clf.fit(X=X[:250], y=y[:250]) @@ -120,7 +126,7 @@ def test_fit_predict_ts_with_only_datetime_column_in_X(ts_data_seasonal): m_clf.fit(X=X[:250], y=y[:250]) y_pred = m_clf.predict(X=X[250:]) - assert (y_pred_sk.to_period('D') == y_pred).all() + assert (y_pred_sk.to_period("D") == y_pred).all() def test_fit_predict_ts_with_X_and_y_index_out_of_sample(ts_data_seasonal): @@ -138,12 +144,18 @@ def test_fit_predict_ts_with_X_and_y_index_out_of_sample(ts_data_seasonal): m_clf.fit(X=X[:250], y=y[:250]) y_pred = m_clf.predict(X=X[250:]) - assert (y_pred_sk.to_period('D') == y_pred).all() + assert (y_pred_sk.to_period("D") == y_pred).all() -@patch('evalml.pipelines.components.estimators.regressors.arima_regressor.ARIMARegressor._format_dates') -@patch('evalml.pipelines.components.estimators.regressors.arima_regressor.ARIMARegressor._get_dates') -def test_fit_predict_ts_with_X_and_y_index(mock_get_dates, mock_format_dates, ts_data_seasonal): +@patch( + "evalml.pipelines.components.estimators.regressors.arima_regressor.ARIMARegressor._format_dates" +) +@patch( + "evalml.pipelines.components.estimators.regressors.arima_regressor.ARIMARegressor._get_dates" +) +def test_fit_predict_ts_with_X_and_y_index( + mock_get_dates, mock_format_dates, ts_data_seasonal +): X, y = ts_data_seasonal assert isinstance(X.index, pd.DatetimeIndex) assert isinstance(y.index, pd.DatetimeIndex) @@ -165,9 +177,15 @@ def test_fit_predict_ts_with_X_and_y_index(mock_get_dates, mock_format_dates, ts assert (y_pred_sk == y_pred).all() -@patch('evalml.pipelines.components.estimators.regressors.arima_regressor.ARIMARegressor._format_dates') -@patch('evalml.pipelines.components.estimators.regressors.arima_regressor.ARIMARegressor._get_dates') -def test_fit_predict_ts_with_X_not_y_index(mock_get_dates, mock_format_dates, ts_data_seasonal): +@patch( + "evalml.pipelines.components.estimators.regressors.arima_regressor.ARIMARegressor._format_dates" +) +@patch( + "evalml.pipelines.components.estimators.regressors.arima_regressor.ARIMARegressor._get_dates" +) +def test_fit_predict_ts_with_X_not_y_index( + mock_get_dates, mock_format_dates, ts_data_seasonal +): X, y = 
ts_data_seasonal assert isinstance(X.index, pd.DatetimeIndex) assert isinstance(y.index, pd.DatetimeIndex) @@ -192,9 +210,15 @@ def test_fit_predict_ts_with_X_not_y_index(mock_get_dates, mock_format_dates, ts assert (y_pred_sk == y_pred).all() -@patch('evalml.pipelines.components.estimators.regressors.arima_regressor.ARIMARegressor._format_dates') -@patch('evalml.pipelines.components.estimators.regressors.arima_regressor.ARIMARegressor._get_dates') -def test_fit_predict_ts_with_y_not_X_index(mock_get_dates, mock_format_dates, ts_data_seasonal): +@patch( + "evalml.pipelines.components.estimators.regressors.arima_regressor.ARIMARegressor._format_dates" +) +@patch( + "evalml.pipelines.components.estimators.regressors.arima_regressor.ARIMARegressor._get_dates" +) +def test_fit_predict_ts_with_y_not_X_index( + mock_get_dates, mock_format_dates, ts_data_seasonal +): X, y = ts_data_seasonal mock_get_dates.return_value = (y.index, X) @@ -223,11 +247,13 @@ def test_predict_ts_without_X_error(ts_data): m_clf = ARIMARegressor() clf_ = m_clf.fit(X=X, y=y) - with pytest.raises(ValueError, match='If X was passed to the fit method of the ARIMARegressor'): + with pytest.raises( + ValueError, match="If X was passed to the fit method of the ARIMARegressor" + ): clf_.predict(y=y) -@patch('sktime.forecasting.base._sktime._SktimeForecaster.predict') +@patch("sktime.forecasting.base._sktime._SktimeForecaster.predict") def test_predict_ts_X_error(mock_sktime_predict, ts_data): X, y = ts_data @@ -235,7 +261,7 @@ def test_predict_ts_X_error(mock_sktime_predict, ts_data): m_clf = ARIMARegressor() clf_ = m_clf.fit(X=X, y=y) - with pytest.raises(ValueError, match='Sktime value error'): + with pytest.raises(ValueError, match="Sktime value error"): clf_.predict(y=y) @@ -247,7 +273,10 @@ def test_fit_ts_with_not_X_not_y_index(ts_data): assert not isinstance(X.index, pd.DatetimeIndex) clf = ARIMARegressor() - with pytest.raises(ValueError, match="If not it will look for the datetime column in the index of X or y."): + with pytest.raises( + ValueError, + match="If not it will look for the datetime column in the index of X or y.", + ): clf.fit(X=X, y=y) @@ -258,7 +287,10 @@ def test_predict_ts_with_not_X_index(ts_data): m_clf = ARIMARegressor() clf_ = m_clf.fit(X=X, y=y) - with pytest.raises(ValueError, match="If not it will look for the datetime column in the index of X."): + with pytest.raises( + ValueError, + match="If not it will look for the datetime column in the index of X.", + ): clf_.predict(X) @@ -283,7 +315,7 @@ def test_fit_predict_ts_no_X_out_of_sample(ts_data_seasonal): m_clf.fit(X=None, y=y[:250]) y_pred = m_clf.predict(X=None, y=y[250:]) - assert (y_pred_sk.to_period('D') == y_pred).all() + assert (y_pred_sk.to_period("D") == y_pred).all() @pytest.mark.parametrize("X_none", [True, False]) @@ -302,7 +334,7 @@ def test_fit_predict_date_index_named_out_of_sample(X_none, ts_data_seasonal): X = X.reset_index() assert not isinstance(X.index, pd.DatetimeIndex) - m_clf = ARIMARegressor(date_index='index', d=None) + m_clf = ARIMARegressor(date_index="index", d=None) if X_none: m_clf.fit(X=None, y=y[:250]) y_pred = m_clf.predict(X=None, y=y[250:]) @@ -310,4 +342,4 @@ def test_fit_predict_date_index_named_out_of_sample(X_none, ts_data_seasonal): m_clf.fit(X=X[:250], y=y[:250]) y_pred = m_clf.predict(X=X[250:], y=y[250:]) - assert (y_pred_sk.to_period('D') == y_pred).all() + assert (y_pred_sk.to_period("D") == y_pred).all() diff --git a/evalml/tests/component_tests/test_baseline_classifier.py 
b/evalml/tests/component_tests/test_baseline_classifier.py index ebe893dcb7..5e341affcf 100644 --- a/evalml/tests/component_tests/test_baseline_classifier.py +++ b/evalml/tests/component_tests/test_baseline_classifier.py @@ -26,9 +26,9 @@ def test_baseline_y_is_None(X_y_binary): BaselineClassifier().fit(X, y=None) -@pytest.mark.parametrize('data_type', ['pd', 'ww']) +@pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_baseline_binary_mode(data_type, make_data_type): - X = pd.DataFrame({'one': [1, 2, 3, 4], 'two': [2, 3, 4, 5], 'three': [1, 2, 3, 4]}) + X = pd.DataFrame({"one": [1, 2, 3, 4], "two": [2, 3, 4, 5], "three": [1, 2, 3, 4]}) y = pd.Series([10, 11, 10, 10]) X = make_data_type(data_type, X) y = make_data_type(data_type, y) @@ -43,7 +43,9 @@ def test_baseline_binary_mode(data_type, make_data_type): predicted_proba = clf.predict_proba(X) assert predicted_proba.shape == (X.shape[0], 2) - expected_predictions_proba = pd.DataFrame({10: [1., 1., 1., 1.], 11: [0., 0., 0., 0.]}) + expected_predictions_proba = pd.DataFrame( + {10: [1.0, 1.0, 1.0, 1.0], 11: [0.0, 0.0, 0.0, 0.0]} + ) assert_frame_equal(expected_predictions_proba, predicted_proba) np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * X.shape[1])) @@ -56,13 +58,17 @@ def test_baseline_binary_random(X_y_binary): clf.fit(X, y) assert clf.classes_ == [0, 1] - expected_predictions = pd.Series(get_random_state(0).choice(np.unique(y), len(X)), dtype="int64") + expected_predictions = pd.Series( + get_random_state(0).choice(np.unique(y), len(X)), dtype="int64" + ) predictions = clf.predict(X) assert_series_equal(expected_predictions, predictions) predicted_proba = clf.predict_proba(X) assert predicted_proba.shape == (len(X), 2) - expected_predictions_proba = pd.DataFrame(np.array([[0.5 for i in range(len(values))]] * len(X))) + expected_predictions_proba = pd.DataFrame( + np.array([[0.5 for i in range(len(values))]] * len(X)) + ) assert_frame_equal(expected_predictions_proba, predicted_proba) np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * X.shape[1])) @@ -78,20 +84,24 @@ def test_baseline_binary_random_weighted(X_y_binary): clf.fit(X, y) assert clf.classes_ == [0, 1] - expected_predictions = pd.Series(get_random_state(0).choice(np.unique(y), len(X), p=percent_freq), dtype="int64") + expected_predictions = pd.Series( + get_random_state(0).choice(np.unique(y), len(X), p=percent_freq), dtype="int64" + ) predictions = clf.predict(X) assert_series_equal(expected_predictions, predictions) predicted_proba = clf.predict_proba(X) assert predicted_proba.shape == (len(X), 2) - expected_predictions_proba = pd.DataFrame(np.array([[percent_freq[i] for i in range(len(values))]] * len(X))) + expected_predictions_proba = pd.DataFrame( + np.array([[percent_freq[i] for i in range(len(values))]] * len(X)) + ) assert_frame_equal(expected_predictions_proba, predicted_proba) np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * X.shape[1])) def test_baseline_multiclass_mode(): - X = pd.DataFrame({'one': [1, 2, 3, 4], 'two': [2, 3, 4, 5], 'three': [1, 2, 3, 4]}) + X = pd.DataFrame({"one": [1, 2, 3, 4], "two": [2, 3, 4, 5], "three": [1, 2, 3, 4]}) y = pd.Series([10, 12, 11, 11]) clf = BaselineClassifier(strategy="mode") clf.fit(X, y) @@ -103,7 +113,9 @@ def test_baseline_multiclass_mode(): predicted_proba = clf.predict_proba(X) assert predicted_proba.shape == (len(X), 3) - expected_predictions_proba = pd.DataFrame({10: [0., 0., 0., 0.], 11: [1., 1., 1., 1.], 12: [0., 0., 0., 0.]}) + 
expected_predictions_proba = pd.DataFrame( + {10: [0.0, 0.0, 0.0, 0.0], 11: [1.0, 1.0, 1.0, 1.0], 12: [0.0, 0.0, 0.0, 0.0]} + ) assert_frame_equal(expected_predictions_proba, predicted_proba) np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * X.shape[1])) @@ -116,13 +128,18 @@ def test_baseline_multiclass_random(X_y_multi): clf.fit(X, y) assert clf.classes_ == [0, 1, 2] - expected_predictions = pd.Series(get_random_state(0).choice(np.unique(y), len(X)), dtype="int64") + expected_predictions = pd.Series( + get_random_state(0).choice(np.unique(y), len(X)), dtype="int64" + ) predictions = clf.predict(X) assert_series_equal(expected_predictions, predictions) predicted_proba = clf.predict_proba(X) assert predicted_proba.shape == (len(X), 3) - assert_frame_equal(pd.DataFrame(np.array([[1. / 3 for i in range(len(values))]] * len(X))), predicted_proba) + assert_frame_equal( + pd.DataFrame(np.array([[1.0 / 3 for i in range(len(values))]] * len(X))), + predicted_proba, + ) np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * X.shape[1])) @@ -135,13 +152,20 @@ def test_baseline_multiclass_random_weighted(X_y_multi): clf.fit(X, y) assert clf.classes_ == [0, 1, 2] - expected_predictions = pd.Series(get_random_state(0).choice(np.unique(y), len(X), p=percent_freq), dtype="int64") + expected_predictions = pd.Series( + get_random_state(0).choice(np.unique(y), len(X), p=percent_freq), dtype="int64" + ) predictions = clf.predict(X) assert_series_equal(expected_predictions, predictions) predicted_proba = clf.predict_proba(X) assert predicted_proba.shape == (len(X), 3) - assert_frame_equal(pd.DataFrame(np.array([[percent_freq[i] for i in range(len(values))]] * len(X))), predicted_proba) + assert_frame_equal( + pd.DataFrame( + np.array([[percent_freq[i] for i in range(len(values))]] * len(X)) + ), + predicted_proba, + ) np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * X.shape[1])) @@ -159,6 +183,9 @@ def test_baseline_no_mode(): predicted_proba = clf.predict_proba(X) assert predicted_proba.shape == (len(X), 3) - assert_frame_equal(pd.DataFrame(np.array([[1.0 if i == 0 else 0.0 for i in range(3)]] * len(X))), predicted_proba) + assert_frame_equal( + pd.DataFrame(np.array([[1.0 if i == 0 else 0.0 for i in range(3)]] * len(X))), + predicted_proba, + ) np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * X.shape[1])) diff --git a/evalml/tests/component_tests/test_catboost_classifier.py b/evalml/tests/component_tests/test_catboost_classifier.py index 1ef6fd41a8..c003891c36 100644 --- a/evalml/tests/component_tests/test_catboost_classifier.py +++ b/evalml/tests/component_tests/test_catboost_classifier.py @@ -4,7 +4,7 @@ from evalml.pipelines.components import CatBoostClassifier from evalml.utils import SEED_BOUNDS -importorskip('catboost', reason='Skipping test because catboost not installed') +importorskip("catboost", reason="Skipping test because catboost not installed") def test_catboost_classifier_random_seed_bounds_seed(X_y_binary): @@ -13,8 +13,12 @@ def test_catboost_classifier_random_seed_bounds_seed(X_y_binary): col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) y = pd.Series(y) - clf = CatBoostClassifier(n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.min_bound) + clf = CatBoostClassifier( + n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.min_bound + ) clf.fit(X, y) - clf = CatBoostClassifier(n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.max_bound) + clf = CatBoostClassifier( + 
n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.max_bound + ) fitted = clf.fit(X, y) assert isinstance(fitted, CatBoostClassifier) diff --git a/evalml/tests/component_tests/test_catboost_regressor.py b/evalml/tests/component_tests/test_catboost_regressor.py index 5b8813174d..8bfe7f1deb 100644 --- a/evalml/tests/component_tests/test_catboost_regressor.py +++ b/evalml/tests/component_tests/test_catboost_regressor.py @@ -4,7 +4,7 @@ from evalml.pipelines.components import CatBoostRegressor from evalml.utils import SEED_BOUNDS -importorskip('catboost', reason='Skipping test because catboost not installed') +importorskip("catboost", reason="Skipping test because catboost not installed") def test_catboost_regressor_random_seed_bounds_seed(X_y_regression): @@ -13,8 +13,12 @@ def test_catboost_regressor_random_seed_bounds_seed(X_y_regression): col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) y = pd.Series(y) - clf = CatBoostRegressor(n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.min_bound) + clf = CatBoostRegressor( + n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.min_bound + ) clf.fit(X, y) - clf = CatBoostRegressor(n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.max_bound) + clf = CatBoostRegressor( + n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.max_bound + ) fitted = clf.fit(X, y) assert isinstance(fitted, CatBoostRegressor) diff --git a/evalml/tests/component_tests/test_column_selector_transformers.py b/evalml/tests/component_tests/test_column_selector_transformers.py index 51efaf77bc..b911fda145 100644 --- a/evalml/tests/component_tests/test_column_selector_transformers.py +++ b/evalml/tests/component_tests/test_column_selector_transformers.py @@ -38,18 +38,31 @@ def test_column_transformer_empty_X(class_to_test): assert transformer.transform(X).empty -@pytest.mark.parametrize("class_to_test,checking_functions", - [(DropColumns, [lambda X, X_t: X_t.equals(X.astype("int64")), - lambda X, X_t: X_t.equals(X.astype("int64")), - lambda X, X_t: X_t.equals(X.drop(columns=["one"]).astype("int64")), - lambda X, X_t: X_t.empty]), - (SelectColumns, [lambda X, X_t: X_t.empty, - lambda X, X_t: X_t.empty, - lambda X, X_t: X_t.equals(X[["one"]].astype("int64")), - lambda X, X_t: X_t.equals(X.astype("int64"))]) - ]) +@pytest.mark.parametrize( + "class_to_test,checking_functions", + [ + ( + DropColumns, + [ + lambda X, X_t: X_t.equals(X.astype("int64")), + lambda X, X_t: X_t.equals(X.astype("int64")), + lambda X, X_t: X_t.equals(X.drop(columns=["one"]).astype("int64")), + lambda X, X_t: X_t.empty, + ], + ), + ( + SelectColumns, + [ + lambda X, X_t: X_t.empty, + lambda X, X_t: X_t.empty, + lambda X, X_t: X_t.equals(X[["one"]].astype("int64")), + lambda X, X_t: X_t.equals(X.astype("int64")), + ], + ), + ], +) def test_column_transformer_transform(class_to_test, checking_functions): - X = pd.DataFrame({'one': [1, 2, 3, 4], 'two': [2, 3, 4, 5], 'three': [1, 2, 3, 4]}) + X = pd.DataFrame({"one": [1, 2, 3, 4], "two": [2, 3, 4, 5], "three": [1, 2, 3, 4]}) check1, check2, check3, check4 = checking_functions transformer = class_to_test(columns=None) @@ -65,16 +78,29 @@ def test_column_transformer_transform(class_to_test, checking_functions): assert check4(X, transformer.transform(X)) -@pytest.mark.parametrize("class_to_test,checking_functions", - [(DropColumns, [lambda X, X_t: X_t.equals(X.astype("int64")), - lambda X, X_t: X_t.equals(X.drop(columns=["one"]).astype("int64")), - lambda X, X_t: X_t.empty]), - (SelectColumns, [lambda X, X_t: 
X_t.empty, - lambda X, X_t: X_t.equals(X[["one"]].astype("int64")), - lambda X, X_t: X_t.equals(X.astype("int64"))]) - ]) +@pytest.mark.parametrize( + "class_to_test,checking_functions", + [ + ( + DropColumns, + [ + lambda X, X_t: X_t.equals(X.astype("int64")), + lambda X, X_t: X_t.equals(X.drop(columns=["one"]).astype("int64")), + lambda X, X_t: X_t.empty, + ], + ), + ( + SelectColumns, + [ + lambda X, X_t: X_t.empty, + lambda X, X_t: X_t.equals(X[["one"]].astype("int64")), + lambda X, X_t: X_t.equals(X.astype("int64")), + ], + ), + ], +) def test_column_transformer_fit_transform(class_to_test, checking_functions): - X = pd.DataFrame({'one': [1, 2, 3, 4], 'two': [2, 3, 4, 5], 'three': [1, 2, 3, 4]}) + X = pd.DataFrame({"one": [1, 2, 3, 4], "two": [2, 3, 4, 5], "three": [1, 2, 3, 4]}) check1, check2, check3 = checking_functions assert check1(X, class_to_test(columns=[]).fit_transform(X)) @@ -86,7 +112,7 @@ def test_column_transformer_fit_transform(class_to_test, checking_functions): @pytest.mark.parametrize("class_to_test", [DropColumns, SelectColumns]) def test_drop_column_transformer_input_invalid_col_name(class_to_test): - X = pd.DataFrame({'one': [1, 2, 3, 4], 'two': [2, 3, 4, 5], 'three': [1, 2, 3, 4]}) + X = pd.DataFrame({"one": [1, 2, 3, 4], "two": [2, 3, 4, 5], "three": [1, 2, 3, 4]}) transformer = class_to_test(columns=["not in data"]) with pytest.raises(ValueError, match="'not in data' not found in input data"): transformer.fit(X) @@ -105,14 +131,31 @@ def test_drop_column_transformer_input_invalid_col_name(class_to_test): transformer.fit_transform(X) -@pytest.mark.parametrize("class_to_test,answers", - [(DropColumns, [pd.DataFrame([[0, 2, 3], [4, 6, 7], [8, 10, 11]], columns=[0, 2, 3], dtype="int64"), - pd.DataFrame([[], [], []], dtype="Int64"), - pd.DataFrame(np.arange(12).reshape(3, 4), dtype="int64")]), - (SelectColumns, [pd.DataFrame([[1], [5], [9]], columns=[1], dtype="int64"), - pd.DataFrame(np.arange(12).reshape(3, 4), dtype="int64"), - pd.DataFrame([[], [], []], dtype="Int64")]) - ]) +@pytest.mark.parametrize( + "class_to_test,answers", + [ + ( + DropColumns, + [ + pd.DataFrame( + [[0, 2, 3], [4, 6, 7], [8, 10, 11]], + columns=[0, 2, 3], + dtype="int64", + ), + pd.DataFrame([[], [], []], dtype="Int64"), + pd.DataFrame(np.arange(12).reshape(3, 4), dtype="int64"), + ], + ), + ( + SelectColumns, + [ + pd.DataFrame([[1], [5], [9]], columns=[1], dtype="int64"), + pd.DataFrame(np.arange(12).reshape(3, 4), dtype="int64"), + pd.DataFrame([[], [], []], dtype="Int64"), + ], + ), + ], +) def test_column_transformer_int_col_names_np_array(class_to_test, answers): X = np.arange(12).reshape(3, 4) answer1, answer2, answer3 = answers diff --git a/evalml/tests/component_tests/test_components.py b/evalml/tests/component_tests/test_components.py index d8aa7845cf..4e8dc4229c 100644 --- a/evalml/tests/component_tests/test_components.py +++ b/evalml/tests/component_tests/test_components.py @@ -13,7 +13,7 @@ from evalml.exceptions import ( ComponentNotYetFittedError, EnsembleMissingPipelinesError, - MethodPropertyNotFoundError + MethodPropertyNotFoundError, ) from evalml.model_family import ModelFamily from evalml.pipelines import BinaryClassificationPipeline, RegressionPipeline @@ -62,17 +62,17 @@ Transformer, Undersampler, XGBoostClassifier, - XGBoostRegressor + XGBoostRegressor, ) from evalml.pipelines.components.ensemble import ( StackedEnsembleClassifier, - StackedEnsembleRegressor + StackedEnsembleRegressor, ) from evalml.pipelines.components.utils import ( _all_estimators, 
_all_transformers, all_components, - generate_component_code + generate_component_code, ) from evalml.problem_types import ProblemTypes @@ -86,7 +86,7 @@ class MockComponent(ComponentBase): class MockEstimator(Estimator): name = "Mock Estimator" model_family = ModelFamily.LINEAR_MODEL - supported_problem_types = ['binary'] + supported_problem_types = ["binary"] class MockTransformer(Transformer): name = "Mock Transformer" @@ -99,7 +99,7 @@ def test_estimator_needs_fitting_false(): class MockEstimatorNeedsFittingFalse(Estimator): name = "Mock Estimator Needs Fitting False" model_family = ModelFamily.LINEAR_MODEL - supported_problem_types = ['binary'] + supported_problem_types = ["binary"] needs_fitting = False def predict(self, X): @@ -110,19 +110,19 @@ def predict(self, X): class MockFitComponent(ComponentBase): model_family = ModelFamily.NONE - name = 'Mock Fit Component' + name = "Mock Fit Component" def __init__(self, param_a=2, param_b=10, random_seed=0): - parameters = {'param_a': param_a, 'param_b': param_b} - super().__init__(parameters=parameters, - component_obj=None, - random_seed=0) + parameters = {"param_a": param_a, "param_b": param_b} + super().__init__(parameters=parameters, component_obj=None, random_seed=0) def fit(self, X, y=None): pass def predict(self, X): - return np.array([self.parameters['param_a'] * 2, self.parameters['param_b'] * 10]) + return np.array( + [self.parameters["param_a"] * 2, self.parameters["param_b"] * 10] + ) def test_init(test_classes): @@ -134,13 +134,22 @@ def test_init(test_classes): def test_describe(test_classes): MockComponent, MockEstimator, MockTransformer = test_classes - params = {'param_a': 'value_a', 'param_b': 123} + params = {"param_a": "value_a", "param_b": 123} component = MockComponent(parameters=params) - assert component.describe(return_dict=True) == {'name': 'Mock Component', 'parameters': params} + assert component.describe(return_dict=True) == { + "name": "Mock Component", + "parameters": params, + } estimator = MockEstimator(parameters=params) - assert estimator.describe(return_dict=True) == {'name': 'Mock Estimator', 'parameters': params} + assert estimator.describe(return_dict=True) == { + "name": "Mock Estimator", + "parameters": params, + } transformer = MockTransformer(parameters=params) - assert transformer.describe(return_dict=True) == {'name': 'Mock Transformer', 'parameters': params} + assert transformer.describe(return_dict=True) == { + "name": "Mock Transformer", + "parameters": params, + } def test_describe_component(): @@ -149,9 +158,13 @@ def test_describe_component(): simple_imputer = SimpleImputer("mean") column_imputer = PerColumnImputer({"a": "mean", "b": ("constant", 100)}) scaler = StandardScaler() - feature_selection_clf = RFClassifierSelectFromModel(n_estimators=10, number_features=5, percent_features=0.3, threshold=-np.inf) - feature_selection_reg = RFRegressorSelectFromModel(n_estimators=10, number_features=5, percent_features=0.3, threshold=-np.inf) - drop_col_transformer = DropColumns(columns=['col_one', 'col_two']) + feature_selection_clf = RFClassifierSelectFromModel( + n_estimators=10, number_features=5, percent_features=0.3, threshold=-np.inf + ) + feature_selection_reg = RFRegressorSelectFromModel( + n_estimators=10, number_features=5, percent_features=0.3, threshold=-np.inf + ) + drop_col_transformer = DropColumns(columns=["col_one", "col_two"]) drop_null_transformer = DropNullColumns() datetime = DateTimeFeaturizer() text_featurizer = TextFeaturizer() @@ -160,40 +173,139 @@ def 
test_describe_component(): lda = LinearDiscriminantAnalysis() ft = DFSTransformer() us = Undersampler() - assert enc.describe(return_dict=True) == {'name': 'One Hot Encoder', 'parameters': {'top_n': 10, - 'features_to_encode': None, - 'categories': None, - 'drop': 'if_binary', - 'handle_unknown': 'ignore', - 'handle_missing': 'error'}} - assert imputer.describe(return_dict=True) == {'name': 'Imputer', 'parameters': {'categorical_impute_strategy': "most_frequent", - 'categorical_fill_value': None, - 'numeric_impute_strategy': "mean", - 'numeric_fill_value': None}} - assert simple_imputer.describe(return_dict=True) == {'name': 'Simple Imputer', 'parameters': {'impute_strategy': 'mean', 'fill_value': None}} - assert column_imputer.describe(return_dict=True) == {'name': 'Per Column Imputer', 'parameters': {'impute_strategies': {'a': 'mean', 'b': ('constant', 100)}, 'default_impute_strategy': 'most_frequent'}} - assert scaler.describe(return_dict=True) == {'name': 'Standard Scaler', 'parameters': {}} - assert feature_selection_clf.describe(return_dict=True) == {'name': 'RF Classifier Select From Model', 'parameters': {'number_features': 5, 'n_estimators': 10, 'max_depth': None, 'percent_features': 0.3, 'threshold': -np.inf, 'n_jobs': -1}} - assert feature_selection_reg.describe(return_dict=True) == {'name': 'RF Regressor Select From Model', 'parameters': {'number_features': 5, 'n_estimators': 10, 'max_depth': None, 'percent_features': 0.3, 'threshold': -np.inf, 'n_jobs': -1}} - assert drop_col_transformer.describe(return_dict=True) == {'name': 'Drop Columns Transformer', 'parameters': {'columns': ['col_one', 'col_two']}} - assert drop_null_transformer.describe(return_dict=True) == {'name': 'Drop Null Columns Transformer', 'parameters': {'pct_null_threshold': 1.0}} - assert datetime.describe(return_dict=True) == {'name': 'DateTime Featurization Component', - 'parameters': {'features_to_extract': ['year', 'month', 'day_of_week', 'hour'], - 'encode_as_categories': False, - 'date_index': None}} - assert text_featurizer.describe(return_dict=True) == {'name': 'Text Featurization Component', 'parameters': {}} - assert lsa.describe(return_dict=True) == {'name': 'LSA Transformer', 'parameters': {}} - assert pca.describe(return_dict=True) == {'name': 'PCA Transformer', 'parameters': {'n_components': None, 'variance': 0.95}} - assert lda.describe(return_dict=True) == {'name': 'Linear Discriminant Analysis Transformer', 'parameters': {'n_components': None}} - assert ft.describe(return_dict=True) == {'name': 'DFS Transformer', 'parameters': {"index": "index"}} - assert us.describe(return_dict=True) == {'name': 'Undersampler', 'parameters': {"sampling_ratio": 0.25, "sampling_ratio_dict": None, "min_samples": 100, "min_percentage": 0.1}} + assert enc.describe(return_dict=True) == { + "name": "One Hot Encoder", + "parameters": { + "top_n": 10, + "features_to_encode": None, + "categories": None, + "drop": "if_binary", + "handle_unknown": "ignore", + "handle_missing": "error", + }, + } + assert imputer.describe(return_dict=True) == { + "name": "Imputer", + "parameters": { + "categorical_impute_strategy": "most_frequent", + "categorical_fill_value": None, + "numeric_impute_strategy": "mean", + "numeric_fill_value": None, + }, + } + assert simple_imputer.describe(return_dict=True) == { + "name": "Simple Imputer", + "parameters": {"impute_strategy": "mean", "fill_value": None}, + } + assert column_imputer.describe(return_dict=True) == { + "name": "Per Column Imputer", + "parameters": { + "impute_strategies": {"a": 
"mean", "b": ("constant", 100)}, + "default_impute_strategy": "most_frequent", + }, + } + assert scaler.describe(return_dict=True) == { + "name": "Standard Scaler", + "parameters": {}, + } + assert feature_selection_clf.describe(return_dict=True) == { + "name": "RF Classifier Select From Model", + "parameters": { + "number_features": 5, + "n_estimators": 10, + "max_depth": None, + "percent_features": 0.3, + "threshold": -np.inf, + "n_jobs": -1, + }, + } + assert feature_selection_reg.describe(return_dict=True) == { + "name": "RF Regressor Select From Model", + "parameters": { + "number_features": 5, + "n_estimators": 10, + "max_depth": None, + "percent_features": 0.3, + "threshold": -np.inf, + "n_jobs": -1, + }, + } + assert drop_col_transformer.describe(return_dict=True) == { + "name": "Drop Columns Transformer", + "parameters": {"columns": ["col_one", "col_two"]}, + } + assert drop_null_transformer.describe(return_dict=True) == { + "name": "Drop Null Columns Transformer", + "parameters": {"pct_null_threshold": 1.0}, + } + assert datetime.describe(return_dict=True) == { + "name": "DateTime Featurization Component", + "parameters": { + "features_to_extract": ["year", "month", "day_of_week", "hour"], + "encode_as_categories": False, + "date_index": None, + }, + } + assert text_featurizer.describe(return_dict=True) == { + "name": "Text Featurization Component", + "parameters": {}, + } + assert lsa.describe(return_dict=True) == { + "name": "LSA Transformer", + "parameters": {}, + } + assert pca.describe(return_dict=True) == { + "name": "PCA Transformer", + "parameters": {"n_components": None, "variance": 0.95}, + } + assert lda.describe(return_dict=True) == { + "name": "Linear Discriminant Analysis Transformer", + "parameters": {"n_components": None}, + } + assert ft.describe(return_dict=True) == { + "name": "DFS Transformer", + "parameters": {"index": "index"}, + } + assert us.describe(return_dict=True) == { + "name": "Undersampler", + "parameters": { + "sampling_ratio": 0.25, + "sampling_ratio_dict": None, + "min_samples": 100, + "min_percentage": 0.1, + }, + } try: smote = SMOTESampler() - assert smote.describe(return_dict=True) == {'name': 'SMOTE Oversampler', 'parameters': {'sampling_ratio': 0.25, "sampling_ratio_dict": None, 'k_neighbors': 5, 'n_jobs': -1}} + assert smote.describe(return_dict=True) == { + "name": "SMOTE Oversampler", + "parameters": { + "sampling_ratio": 0.25, + "sampling_ratio_dict": None, + "k_neighbors": 5, + "n_jobs": -1, + }, + } smote = SMOTENCSampler() - assert smote.describe(return_dict=True) == {'name': 'SMOTENC Oversampler', 'parameters': {'sampling_ratio': 0.25, "sampling_ratio_dict": None, 'k_neighbors': 5, 'n_jobs': -1}} + assert smote.describe(return_dict=True) == { + "name": "SMOTENC Oversampler", + "parameters": { + "sampling_ratio": 0.25, + "sampling_ratio_dict": None, + "k_neighbors": 5, + "n_jobs": -1, + }, + } smote = SMOTENSampler() - assert smote.describe(return_dict=True) == {'name': 'SMOTEN Oversampler', 'parameters': {'sampling_ratio': 0.25, "sampling_ratio_dict": None, 'k_neighbors': 5, 'n_jobs': -1}} + assert smote.describe(return_dict=True) == { + "name": "SMOTEN Oversampler", + "parameters": { + "sampling_ratio": 0.25, + "sampling_ratio_dict": None, + "k_neighbors": 5, + "n_jobs": -1, + }, + } except ImportError: pass # testing estimators @@ -209,39 +321,176 @@ def test_describe_component(): linear_regressor = LinearRegressor() svm_classifier = SVMClassifier() svm_regressor = SVMRegressor() - assert base_classifier.describe(return_dict=True) 
== {'name': 'Baseline Classifier', 'parameters': {'strategy': 'mode'}} - assert base_regressor.describe(return_dict=True) == {'name': 'Baseline Regressor', 'parameters': {'strategy': 'mean'}} - assert lr_classifier.describe(return_dict=True) == {'name': 'Logistic Regression Classifier', 'parameters': {'penalty': 'l2', 'C': 1.0, 'n_jobs': -1, 'multi_class': 'auto', 'solver': 'lbfgs'}} - assert en_classifier.describe(return_dict=True) == {'name': 'Elastic Net Classifier', 'parameters': {'alpha': 0.0001, 'l1_ratio': 0.15, 'n_jobs': -1, 'max_iter': 1000, "loss": 'log', 'penalty': 'elasticnet'}} - assert en_regressor.describe(return_dict=True) == {'name': 'Elastic Net Regressor', 'parameters': {'alpha': 0.0001, 'l1_ratio': 0.15, 'max_iter': 1000, 'normalize': False}} - assert et_classifier.describe(return_dict=True) == {'name': 'Extra Trees Classifier', 'parameters': {'n_estimators': 10, 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1}} - assert et_regressor.describe(return_dict=True) == {'name': 'Extra Trees Regressor', 'parameters': {'n_estimators': 10, 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1}} - assert rf_classifier.describe(return_dict=True) == {'name': 'Random Forest Classifier', 'parameters': {'n_estimators': 10, 'max_depth': 3, 'n_jobs': -1}} - assert rf_regressor.describe(return_dict=True) == {'name': 'Random Forest Regressor', 'parameters': {'n_estimators': 10, 'max_depth': 3, 'n_jobs': -1}} - assert linear_regressor.describe(return_dict=True) == {'name': 'Linear Regressor', 'parameters': {'fit_intercept': True, 'normalize': False, 'n_jobs': -1}} - assert svm_classifier.describe(return_dict=True) == {'name': 'SVM Classifier', 'parameters': {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale', 'probability': True}} - assert svm_regressor.describe(return_dict=True) == {'name': 'SVM Regressor', 'parameters': {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale'}} + assert base_classifier.describe(return_dict=True) == { + "name": "Baseline Classifier", + "parameters": {"strategy": "mode"}, + } + assert base_regressor.describe(return_dict=True) == { + "name": "Baseline Regressor", + "parameters": {"strategy": "mean"}, + } + assert lr_classifier.describe(return_dict=True) == { + "name": "Logistic Regression Classifier", + "parameters": { + "penalty": "l2", + "C": 1.0, + "n_jobs": -1, + "multi_class": "auto", + "solver": "lbfgs", + }, + } + assert en_classifier.describe(return_dict=True) == { + "name": "Elastic Net Classifier", + "parameters": { + "alpha": 0.0001, + "l1_ratio": 0.15, + "n_jobs": -1, + "max_iter": 1000, + "loss": "log", + "penalty": "elasticnet", + }, + } + assert en_regressor.describe(return_dict=True) == { + "name": "Elastic Net Regressor", + "parameters": { + "alpha": 0.0001, + "l1_ratio": 0.15, + "max_iter": 1000, + "normalize": False, + }, + } + assert et_classifier.describe(return_dict=True) == { + "name": "Extra Trees Classifier", + "parameters": { + "n_estimators": 10, + "max_features": "auto", + "max_depth": 6, + "min_samples_split": 2, + "min_weight_fraction_leaf": 0.0, + "n_jobs": -1, + }, + } + assert et_regressor.describe(return_dict=True) == { + "name": "Extra Trees Regressor", + "parameters": { + "n_estimators": 10, + "max_features": "auto", + "max_depth": 6, + "min_samples_split": 2, + "min_weight_fraction_leaf": 0.0, + "n_jobs": -1, + }, + } + assert rf_classifier.describe(return_dict=True) == { + "name": "Random Forest Classifier", + 
"parameters": {"n_estimators": 10, "max_depth": 3, "n_jobs": -1}, + } + assert rf_regressor.describe(return_dict=True) == { + "name": "Random Forest Regressor", + "parameters": {"n_estimators": 10, "max_depth": 3, "n_jobs": -1}, + } + assert linear_regressor.describe(return_dict=True) == { + "name": "Linear Regressor", + "parameters": {"fit_intercept": True, "normalize": False, "n_jobs": -1}, + } + assert svm_classifier.describe(return_dict=True) == { + "name": "SVM Classifier", + "parameters": { + "C": 1.0, + "kernel": "rbf", + "gamma": "scale", + "probability": True, + }, + } + assert svm_regressor.describe(return_dict=True) == { + "name": "SVM Regressor", + "parameters": {"C": 1.0, "kernel": "rbf", "gamma": "scale"}, + } try: - xgb_classifier = XGBoostClassifier(eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75) - xgb_regressor = XGBoostRegressor(eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75) - assert xgb_classifier.describe(return_dict=True) == {'name': 'XGBoost Classifier', 'parameters': {'eta': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 75}} - assert xgb_regressor.describe(return_dict=True) == {'name': 'XGBoost Regressor', 'parameters': {'eta': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 75}} + xgb_classifier = XGBoostClassifier( + eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75 + ) + xgb_regressor = XGBoostRegressor( + eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75 + ) + assert xgb_classifier.describe(return_dict=True) == { + "name": "XGBoost Classifier", + "parameters": { + "eta": 0.1, + "max_depth": 3, + "min_child_weight": 1, + "n_estimators": 75, + }, + } + assert xgb_regressor.describe(return_dict=True) == { + "name": "XGBoost Regressor", + "parameters": { + "eta": 0.1, + "max_depth": 3, + "min_child_weight": 1, + "n_estimators": 75, + }, + } except ImportError: pass try: cb_classifier = CatBoostClassifier() cb_regressor = CatBoostRegressor() - assert cb_classifier.describe(return_dict=True) == {'name': 'CatBoost Classifier', 'parameters': {'allow_writing_files': False, 'n_estimators': 10, 'eta': 0.03, 'max_depth': 6, 'bootstrap_type': None, 'silent': True}} - assert cb_regressor.describe(return_dict=True) == {'name': 'CatBoost Regressor', 'parameters': {'allow_writing_files': False, 'n_estimators': 10, 'eta': 0.03, 'max_depth': 6, 'bootstrap_type': None, 'silent': False}} + assert cb_classifier.describe(return_dict=True) == { + "name": "CatBoost Classifier", + "parameters": { + "allow_writing_files": False, + "n_estimators": 10, + "eta": 0.03, + "max_depth": 6, + "bootstrap_type": None, + "silent": True, + }, + } + assert cb_regressor.describe(return_dict=True) == { + "name": "CatBoost Regressor", + "parameters": { + "allow_writing_files": False, + "n_estimators": 10, + "eta": 0.03, + "max_depth": 6, + "bootstrap_type": None, + "silent": False, + }, + } except ImportError: pass try: lg_classifier = LightGBMClassifier() lg_regressor = LightGBMRegressor() - assert lg_classifier.describe(return_dict=True) == {'name': 'LightGBM Classifier', 'parameters': {'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 100, 'max_depth': 0, 'num_leaves': 31, - 'min_child_samples': 20, 'n_jobs': -1, 'bagging_fraction': 0.9, 'bagging_freq': 0}} - assert lg_regressor.describe(return_dict=True) == {'name': 'LightGBM Regressor', 'parameters': {'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 20, 'max_depth': 0, 'num_leaves': 31, - 'min_child_samples': 20, 'n_jobs': -1, 'bagging_fraction': 0.9, 
'bagging_freq': 0}} + assert lg_classifier.describe(return_dict=True) == { + "name": "LightGBM Classifier", + "parameters": { + "boosting_type": "gbdt", + "learning_rate": 0.1, + "n_estimators": 100, + "max_depth": 0, + "num_leaves": 31, + "min_child_samples": 20, + "n_jobs": -1, + "bagging_fraction": 0.9, + "bagging_freq": 0, + }, + } + assert lg_regressor.describe(return_dict=True) == { + "name": "LightGBM Regressor", + "parameters": { + "boosting_type": "gbdt", + "learning_rate": 0.1, + "n_estimators": 20, + "max_depth": 0, + "num_leaves": 31, + "min_child_samples": 20, + "n_jobs": -1, + "bagging_fraction": 0.9, + "bagging_freq": 0, + }, + } except ImportError: pass @@ -278,50 +527,72 @@ def fit(self, X, y=None): return self component = MockComponent() - with pytest.raises(MethodPropertyNotFoundError, match="Component requires a fit method or a component_obj that implements fit"): + with pytest.raises( + MethodPropertyNotFoundError, + match="Component requires a fit method or a component_obj that implements fit", + ): component.fit(X) estimator = MockEstimator() estimator._is_fitted = True - with pytest.raises(MethodPropertyNotFoundError, match="Estimator requires a predict method or a component_obj that implements predict"): + with pytest.raises( + MethodPropertyNotFoundError, + match="Estimator requires a predict method or a component_obj that implements predict", + ): estimator.predict(X) - with pytest.raises(MethodPropertyNotFoundError, match="Estimator requires a predict_proba method or a component_obj that implements predict_proba"): + with pytest.raises( + MethodPropertyNotFoundError, + match="Estimator requires a predict_proba method or a component_obj that implements predict_proba", + ): estimator.predict_proba(X) - with pytest.raises(MethodPropertyNotFoundError, match="Estimator requires a feature_importance property or a component_obj that implements feature_importances_"): + with pytest.raises( + MethodPropertyNotFoundError, + match="Estimator requires a feature_importance property or a component_obj that implements feature_importances_", + ): estimator.feature_importance transformer = MockTransformer() transformer_with_fit = MockTransformerWithFit() transformer._is_fitted = True - with pytest.raises(MethodPropertyNotFoundError, match="Component requires a fit method or a component_obj that implements fit"): + with pytest.raises( + MethodPropertyNotFoundError, + match="Component requires a fit method or a component_obj that implements fit", + ): transformer.fit(X, y) - with pytest.raises(MethodPropertyNotFoundError, match="Transformer requires a transform method or a component_obj that implements transform"): + with pytest.raises( + MethodPropertyNotFoundError, + match="Transformer requires a transform method or a component_obj that implements transform", + ): transformer.transform(X) - with pytest.raises(MethodPropertyNotFoundError, match="Component requires a fit method or a component_obj that implements fit"): + with pytest.raises( + MethodPropertyNotFoundError, + match="Component requires a fit method or a component_obj that implements fit", + ): transformer.fit_transform(X) - with pytest.raises(MethodPropertyNotFoundError, match="Transformer requires a transform method or a component_obj that implements transform"): + with pytest.raises( + MethodPropertyNotFoundError, + match="Transformer requires a transform method or a component_obj that implements transform", + ): transformer_with_fit.fit_transform(X) def test_component_fit(X_y_binary): X, y = X_y_binary - class 
MockEstimator(): + class MockEstimator: def fit(self, X, y): pass class MockComponent(Estimator): - name = 'Mock Estimator' + name = "Mock Estimator" model_family = ModelFamily.LINEAR_MODEL - supported_problem_types = ['binary'] + supported_problem_types = ["binary"] hyperparameter_ranges = {} def __init__(self): parameters = {} est = MockEstimator() - super().__init__(parameters=parameters, - component_obj=est, - random_seed=0) + super().__init__(parameters=parameters, component_obj=est, random_seed=0) est = MockComponent() assert isinstance(est.fit(X, y), ComponentBase) @@ -339,9 +610,7 @@ def fit_transform(self, X, y=None): def __init__(self): parameters = {} - super().__init__(parameters=parameters, - component_obj=None, - random_seed=0) + super().__init__(parameters=parameters, component_obj=None, random_seed=0) class MockTransformerWithFitTransformButError(Transformer): name = "Mock Transformer" @@ -352,9 +621,7 @@ def fit_transform(self, X, y=None): def __init__(self): parameters = {} - super().__init__(parameters=parameters, - component_obj=None, - random_seed=0) + super().__init__(parameters=parameters, component_obj=None, random_seed=0) class MockTransformerWithFitAndTransform(Transformer): name = "Mock Transformer" @@ -368,9 +635,7 @@ def transform(self, X, y=None): def __init__(self): parameters = {} - super().__init__(parameters=parameters, - component_obj=None, - random_seed=0) + super().__init__(parameters=parameters, component_obj=None, random_seed=0) class MockTransformerWithOnlyFit(Transformer): name = "Mock Transformer" @@ -381,9 +646,7 @@ def fit(self, X, y=None): def __init__(self): parameters = {} - super().__init__(parameters=parameters, - component_obj=None, - random_seed=0) + super().__init__(parameters=parameters, component_obj=None, random_seed=0) # convert data to pd DataFrame, because the component classes don't # standardize to pd DataFrame @@ -432,23 +695,30 @@ def test_component_describe(test_classes, caplog): def test_component_parameters_getter(test_classes): MockComponent, _, _ = test_classes - component = MockComponent({'test': 'parameter'}) - assert component.parameters == {'test': 'parameter'} - component.parameters['test'] = 'new' - assert component.parameters == {'test': 'parameter'} + component = MockComponent({"test": "parameter"}) + assert component.parameters == {"test": "parameter"} + component.parameters["test"] = "new" + assert component.parameters == {"test": "parameter"} -def test_component_parameters_init(logistic_regression_binary_pipeline_class, - linear_regression_pipeline_class): +def test_component_parameters_init( + logistic_regression_binary_pipeline_class, linear_regression_pipeline_class +): for component_class in all_components(): - print('Testing component {}'.format(component_class.name)) + print("Testing component {}".format(component_class.name)) try: component = component_class() except EnsembleMissingPipelinesError: if component_class == StackedEnsembleClassifier: - component = component_class(input_pipelines=[logistic_regression_binary_pipeline_class(parameters={})]) + component = component_class( + input_pipelines=[ + logistic_regression_binary_pipeline_class(parameters={}) + ] + ) elif component_class == StackedEnsembleRegressor: - component = component_class(input_pipelines=[linear_regression_pipeline_class(parameters={})]) + component = component_class( + input_pipelines=[linear_regression_pipeline_class(parameters={})] + ) parameters = component.parameters component2 = component_class(**parameters) @@ -458,7 +728,7 @@ 
def test_component_parameters_init(logistic_regression_binary_pipeline_class, def test_clone_init(): - params = {'param_a': 2, 'param_b': 11} + params = {"param_a": 2, "param_b": 11} clf = MockFitComponent(**params) clf_clone = clf.clone() assert clf.parameters == clf_clone.parameters @@ -467,7 +737,7 @@ def test_clone_init(): def test_clone_fitted(X_y_binary): X, y = X_y_binary - params = {'param_a': 3, 'param_b': 7} + params = {"param_a": 3, "param_b": 7} clf = MockFitComponent(**params) clf.fit(X, y) predicted = clf.predict(X) @@ -476,7 +746,7 @@ def test_clone_fitted(X_y_binary): assert clf_clone.random_seed == clf.random_seed assert clf.parameters == clf_clone.parameters - with pytest.raises(ComponentNotYetFittedError, match='You must fit'): + with pytest.raises(ComponentNotYetFittedError, match="You must fit"): clf_clone.predict(X) clf_clone.fit(X, y) @@ -496,7 +766,7 @@ def test_components_init_kwargs(): obj_class = component._component_obj.__class__.__name__ module = component._component_obj.__module__ importlib.import_module(module, obj_class) - patched = module + '.' + obj_class + '.__init__' + patched = module + "." + obj_class + ".__init__" def all_init(self, *args, **kwargs): for k, v in kwargs.items(): @@ -505,7 +775,7 @@ def all_init(self, *args, **kwargs): with patch(patched, new=all_init) as _: component = component_class(test_arg="test") component_with_different_kwargs = component_class(diff_test_arg="test") - assert component.parameters['test_arg'] == "test" + assert component.parameters["test_arg"] == "test" if not isinstance(component, PolynomialDetrender): assert component._component_obj.test_arg == "test" # Test equality of different components with same or different kwargs @@ -526,24 +796,33 @@ def test_transformer_transform_output_type(X_y_binary): y_list = list(y_np) X_df_no_col_names = pd.DataFrame(X_np) range_index = pd.RangeIndex(start=0, stop=X_np.shape[1], step=1) - X_df_with_col_names = pd.DataFrame(X_np, columns=['x' + str(i) for i in range(X_np.shape[1])]) + X_df_with_col_names = pd.DataFrame( + X_np, columns=["x" + str(i) for i in range(X_np.shape[1])] + ) y_series_no_name = pd.Series(y_np) - y_series_with_name = pd.Series(y_np, name='target') - datatype_combos = [(X_np, y_np, range_index), - (X_np, y_list, range_index), - (X_df_no_col_names, y_series_no_name, range_index), - (X_df_with_col_names, y_series_with_name, X_df_with_col_names.columns)] + y_series_with_name = pd.Series(y_np, name="target") + datatype_combos = [ + (X_np, y_np, range_index), + (X_np, y_list, range_index), + (X_df_no_col_names, y_series_no_name, range_index), + (X_df_with_col_names, y_series_with_name, X_df_with_col_names.columns), + ] for component_class in _all_transformers(): if component_class == PolynomialDetrender: # Skipping because this test is handled in test_polynomial_detrender continue - print('Testing transformer {}'.format(component_class.name)) + print("Testing transformer {}".format(component_class.name)) for X, y, X_cols_expected in datatype_combos: - print('Checking output of transform for transformer "{}" on X type {} cols {}, y type {} name {}' - .format(component_class.name, type(X), - X.columns if isinstance(X, pd.DataFrame) else None, type(y), - y.name if isinstance(y, pd.Series) else None)) + print( + 'Checking output of transform for transformer "{}" on X type {} cols {}, y type {} name {}'.format( + component_class.name, + type(X), + X.columns if isinstance(X, pd.DataFrame) else None, + type(y), + y.name if isinstance(y, pd.Series) else None, + ) + ) 
component = component_class() # SMOTE will throw an error if we pass a ratio lower than the current class balance @@ -559,7 +838,7 @@ def test_transformer_transform_output_type(X_y_binary): if isinstance(component, TargetImputer): assert isinstance(transform_output[0], pd.DataFrame) assert isinstance(transform_output[1], pd.Series) - elif 'sampler' in component.name: + elif "sampler" in component.name: assert isinstance(transform_output[0], pd.DataFrame) assert transform_output[1] is None else: @@ -567,7 +846,9 @@ def test_transformer_transform_output_type(X_y_binary): if isinstance(component, SelectColumns): assert transform_output.shape == (X.shape[0], 0) - elif isinstance(component, PCA) or isinstance(component, LinearDiscriminantAnalysis): + elif isinstance(component, PCA) or isinstance( + component, LinearDiscriminantAnalysis + ): assert transform_output.shape[0] == X.shape[0] assert transform_output.shape[1] <= X.shape[1] elif isinstance(component, DFSTransformer): @@ -581,14 +862,14 @@ def test_transformer_transform_output_type(X_y_binary): assert transform_output[0].shape == X.shape assert transform_output[1].shape[0] == X.shape[0] assert len(transform_output[1].shape) == 1 - elif 'sampler' in component.name: + elif "sampler" in component.name: assert transform_output[0].shape == X.shape else: assert transform_output.shape == X.shape - assert (list(transform_output.columns) == list(X_cols_expected)) + assert list(transform_output.columns) == list(X_cols_expected) transform_output = component.fit_transform(X, y=y) - if isinstance(component, TargetImputer) or 'sampler' in component.name: + if isinstance(component, TargetImputer) or "sampler" in component.name: assert isinstance(transform_output[0], pd.DataFrame) assert isinstance(transform_output[1], pd.Series) else: @@ -596,7 +877,9 @@ def test_transformer_transform_output_type(X_y_binary): if isinstance(component, SelectColumns): assert transform_output.shape == (X.shape[0], 0) - elif isinstance(component, PCA) or isinstance(component, LinearDiscriminantAnalysis): + elif isinstance(component, PCA) or isinstance( + component, LinearDiscriminantAnalysis + ): assert transform_output.shape[0] == X.shape[0] assert transform_output.shape[1] <= X.shape[1] elif isinstance(component, DFSTransformer): @@ -606,20 +889,36 @@ def test_transformer_transform_output_type(X_y_binary): assert transform_output[0].shape == X.shape assert transform_output[1].shape[0] == X.shape[0] assert len(transform_output[1].shape) == 1 - elif 'sampler' in component.name: + elif "sampler" in component.name: assert transform_output[0].shape == X.shape assert transform_output[1].shape[0] == X.shape[0] else: assert transform_output.shape == X.shape - assert (list(transform_output.columns) == list(X_cols_expected)) + assert list(transform_output.columns) == list(X_cols_expected) -@pytest.mark.parametrize("cls", [cls for cls in all_components() if cls not in [StackedEnsembleRegressor, StackedEnsembleClassifier]]) +@pytest.mark.parametrize( + "cls", + [ + cls + for cls in all_components() + if cls not in [StackedEnsembleRegressor, StackedEnsembleClassifier] + ], +) def test_default_parameters(cls): - assert cls.default_parameters == cls().parameters, f"{cls.__name__}'s default parameters don't match __init__." - - -@pytest.mark.parametrize("cls", [cls for cls in all_components() if cls not in [StackedEnsembleRegressor, StackedEnsembleClassifier]]) + assert ( + cls.default_parameters == cls().parameters + ), f"{cls.__name__}'s default parameters don't match __init__." 
+ + +@pytest.mark.parametrize( + "cls", + [ + cls + for cls in all_components() + if cls not in [StackedEnsembleRegressor, StackedEnsembleClassifier] + ], +) def test_default_parameters_raise_no_warnings(cls): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") @@ -628,7 +927,7 @@ def test_default_parameters_raise_no_warnings(cls): def test_estimator_check_for_fit(X_y_binary): - class MockEstimatorObj(): + class MockEstimatorObj: def __init__(self): pass @@ -648,17 +947,19 @@ def predict_proba(self, X): class MockEstimator(Estimator): name = "Mock Estimator" model_family = ModelFamily.LINEAR_MODEL - supported_problem_types = ['binary'] + supported_problem_types = ["binary"] def __init__(self, parameters=None, component_obj=None, random_seed=0): est = MockEstimatorObj() - super().__init__(parameters=parameters, component_obj=est, random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=est, random_seed=random_seed + ) X, y = X_y_binary est = MockEstimator() - with pytest.raises(ComponentNotYetFittedError, match='You must fit'): + with pytest.raises(ComponentNotYetFittedError, match="You must fit"): est.predict(X) - with pytest.raises(ComponentNotYetFittedError, match='You must fit'): + with pytest.raises(ComponentNotYetFittedError, match="You must fit"): est.predict_proba(X) est.fit(X, y) @@ -685,17 +986,21 @@ class MockTransformer(Transformer): def __init__(self, parameters=None, component_obj=None, random_seed=0): transformer = MockTransformerObj() - super().__init__(parameters=parameters, component_obj=transformer, random_seed=random_seed) + super().__init__( + parameters=parameters, + component_obj=transformer, + random_seed=random_seed, + ) def inverse_transform(self, X, y=None): return X, y X, y = X_y_binary trans = MockTransformer() - with pytest.raises(ComponentNotYetFittedError, match='You must fit'): + with pytest.raises(ComponentNotYetFittedError, match="You must fit"): trans.transform(X) - with pytest.raises(ComponentNotYetFittedError, match='You must fit'): + with pytest.raises(ComponentNotYetFittedError, match="You must fit"): trans.inverse_transform(X, y) trans.fit(X, y) @@ -731,9 +1036,9 @@ def transform(self, X): transformer = MockTransformerWithOverride() transformer_subclass = MockTransformerWithOverrideSubclass() - with pytest.raises(ComponentNotYetFittedError, match='You must fit'): + with pytest.raises(ComponentNotYetFittedError, match="You must fit"): transformer.transform(X) - with pytest.raises(ComponentNotYetFittedError, match='You must fit'): + with pytest.raises(ComponentNotYetFittedError, match="You must fit"): transformer_subclass.transform(X) transformer.fit(X, y) @@ -744,7 +1049,11 @@ def transform(self, X): def test_all_transformers_needs_fitting(): for component_class in _all_transformers() + _all_estimators(): - if component_class.__name__ in ['DropColumns', 'SelectColumns', 'DelayedFeatureTransformer']: + if component_class.__name__ in [ + "DropColumns", + "SelectColumns", + "DelayedFeatureTransformer", + ]: assert not component_class.needs_fitting else: assert component_class.needs_fitting @@ -764,7 +1073,9 @@ def test_all_transformers_check_fit(X_y_binary): # handled in test_oversamplers continue - with pytest.raises(ComponentNotYetFittedError, match=f'You must fit {component_class.__name__}'): + with pytest.raises( + ComponentNotYetFittedError, match=f"You must fit {component_class.__name__}" + ): component.transform(X, y) component.fit(X, y) @@ -777,37 +1088,64 @@ def 
test_all_transformers_check_fit(X_y_binary): component.transform(X, y) -def test_all_estimators_check_fit(X_y_binary, ts_data, test_estimator_needs_fitting_false, helper_functions): - estimators_to_check = [estimator for estimator in _all_estimators() if estimator not in [StackedEnsembleClassifier, StackedEnsembleRegressor, TimeSeriesBaselineEstimator]] + [test_estimator_needs_fitting_false] +def test_all_estimators_check_fit( + X_y_binary, ts_data, test_estimator_needs_fitting_false, helper_functions +): + estimators_to_check = [ + estimator + for estimator in _all_estimators() + if estimator + not in [ + StackedEnsembleClassifier, + StackedEnsembleRegressor, + TimeSeriesBaselineEstimator, + ] + ] + [test_estimator_needs_fitting_false] for component_class in estimators_to_check: if not component_class.needs_fitting: continue - if ProblemTypes.TIME_SERIES_REGRESSION in component_class.supported_problem_types: + if ( + ProblemTypes.TIME_SERIES_REGRESSION + in component_class.supported_problem_types + ): X, y = ts_data else: X, y = X_y_binary component = helper_functions.safe_init_component_with_njobs_1(component_class) - with pytest.raises(ComponentNotYetFittedError, match=f'You must fit {component_class.__name__}'): + with pytest.raises( + ComponentNotYetFittedError, match=f"You must fit {component_class.__name__}" + ): component.predict(X) - if ProblemTypes.BINARY in component.supported_problem_types or ProblemTypes.MULTICLASS in component.supported_problem_types: - with pytest.raises(ComponentNotYetFittedError, match=f'You must fit {component_class.__name__}'): + if ( + ProblemTypes.BINARY in component.supported_problem_types + or ProblemTypes.MULTICLASS in component.supported_problem_types + ): + with pytest.raises( + ComponentNotYetFittedError, + match=f"You must fit {component_class.__name__}", + ): component.predict_proba(X) - with pytest.raises(ComponentNotYetFittedError, match=f'You must fit {component_class.__name__}'): + with pytest.raises( + ComponentNotYetFittedError, match=f"You must fit {component_class.__name__}" + ): component.feature_importance component.fit(X, y) - if ProblemTypes.BINARY in component.supported_problem_types or ProblemTypes.MULTICLASS in component.supported_problem_types: + if ( + ProblemTypes.BINARY in component.supported_problem_types + or ProblemTypes.MULTICLASS in component.supported_problem_types + ): component.predict_proba(X) component.predict(X) component.feature_importance -@pytest.mark.parametrize("data_type", ['li', 'np', 'pd', 'ww']) +@pytest.mark.parametrize("data_type", ["li", "np", "pd", "ww"]) def test_all_transformers_check_fit_input_type(data_type, X_y_binary, make_data_type): X, y = X_y_binary X = make_data_type(data_type, X) @@ -821,11 +1159,15 @@ def test_all_transformers_check_fit_input_type(data_type, X_y_binary, make_data_ component.fit(X, y) -def test_no_fitting_required_components(X_y_binary, test_estimator_needs_fitting_false, helper_functions): +def test_no_fitting_required_components( + X_y_binary, test_estimator_needs_fitting_false, helper_functions +): X, y = X_y_binary for component_class in all_components() + [test_estimator_needs_fitting_false]: if not component_class.needs_fitting: - component = helper_functions.safe_init_component_with_njobs_1(component_class) + component = helper_functions.safe_init_component_with_njobs_1( + component_class + ) if issubclass(component_class, Estimator): component.predict(X) else: @@ -833,18 +1175,36 @@ def test_no_fitting_required_components(X_y_binary, test_estimator_needs_fitting def 
test_serialization(X_y_binary, ts_data, tmpdir, helper_functions): - path = os.path.join(str(tmpdir), 'component.pkl') + path = os.path.join(str(tmpdir), "component.pkl") for component_class in all_components(): - print('Testing serialization of component {}'.format(component_class.name)) + print("Testing serialization of component {}".format(component_class.name)) try: - component = helper_functions.safe_init_component_with_njobs_1(component_class) + component = helper_functions.safe_init_component_with_njobs_1( + component_class + ) except EnsembleMissingPipelinesError: - if (component_class == StackedEnsembleClassifier): - component = component_class(input_pipelines=[BinaryClassificationPipeline([RandomForestClassifier], - parameters={"Random Forest Classifier": {"n_jobs": 1}})]) - elif (component_class == StackedEnsembleRegressor): - component = component_class(input_pipelines=[RegressionPipeline([RandomForestRegressor], parameters={"Random Forest Regressor": {"n_jobs": 1}})]) - if isinstance(component, Estimator) and ProblemTypes.TIME_SERIES_REGRESSION in component.supported_problem_types: + if component_class == StackedEnsembleClassifier: + component = component_class( + input_pipelines=[ + BinaryClassificationPipeline( + [RandomForestClassifier], + parameters={"Random Forest Classifier": {"n_jobs": 1}}, + ) + ] + ) + elif component_class == StackedEnsembleRegressor: + component = component_class( + input_pipelines=[ + RegressionPipeline( + [RandomForestRegressor], + parameters={"Random Forest Regressor": {"n_jobs": 1}}, + ) + ] + ) + if ( + isinstance(component, Estimator) + and ProblemTypes.TIME_SERIES_REGRESSION in component.supported_problem_types + ): X, y = ts_data else: X, y = X_y_binary @@ -855,40 +1215,60 @@ def test_serialization(X_y_binary, ts_data, tmpdir, helper_functions): component.save(path, pickle_protocol=pickle_protocol) loaded_component = ComponentBase.load(path) assert component.parameters == loaded_component.parameters - assert component.describe(return_dict=True) == loaded_component.describe(return_dict=True) - if (issubclass(component_class, Estimator) and not (isinstance(component, StackedEnsembleClassifier) or isinstance(component, StackedEnsembleRegressor))): - assert (component.feature_importance == loaded_component.feature_importance).all() - - -@patch('cloudpickle.dump') + assert component.describe(return_dict=True) == loaded_component.describe( + return_dict=True + ) + if issubclass(component_class, Estimator) and not ( + isinstance(component, StackedEnsembleClassifier) + or isinstance(component, StackedEnsembleRegressor) + ): + assert ( + component.feature_importance == loaded_component.feature_importance + ).all() + + +@patch("cloudpickle.dump") def test_serialization_protocol(mock_cloudpickle_dump, tmpdir): - path = os.path.join(str(tmpdir), 'pipe.pkl') + path = os.path.join(str(tmpdir), "pipe.pkl") component = LogisticRegressionClassifier() component.save(path) assert len(mock_cloudpickle_dump.call_args_list) == 1 - assert mock_cloudpickle_dump.call_args_list[0][1]['protocol'] == cloudpickle.DEFAULT_PROTOCOL + assert ( + mock_cloudpickle_dump.call_args_list[0][1]["protocol"] + == cloudpickle.DEFAULT_PROTOCOL + ) mock_cloudpickle_dump.reset_mock() component.save(path, pickle_protocol=42) assert len(mock_cloudpickle_dump.call_args_list) == 1 - assert mock_cloudpickle_dump.call_args_list[0][1]['protocol'] == 42 + assert mock_cloudpickle_dump.call_args_list[0][1]["protocol"] == 42 @pytest.mark.parametrize("estimator_class", _all_estimators()) -def 
test_estimators_accept_all_kwargs(estimator_class, - logistic_regression_binary_pipeline_class, - linear_regression_pipeline_class): +def test_estimators_accept_all_kwargs( + estimator_class, + logistic_regression_binary_pipeline_class, + linear_regression_pipeline_class, +): try: estimator = estimator_class() except EnsembleMissingPipelinesError: if estimator_class == StackedEnsembleClassifier: - estimator = estimator_class(input_pipelines=[logistic_regression_binary_pipeline_class(parameters={})]) + estimator = estimator_class( + input_pipelines=[ + logistic_regression_binary_pipeline_class(parameters={}) + ] + ) elif estimator_class == StackedEnsembleRegressor: - estimator = estimator_class(input_pipelines=[linear_regression_pipeline_class(parameters={})]) + estimator = estimator_class( + input_pipelines=[linear_regression_pipeline_class(parameters={})] + ) if estimator._component_obj is None: - pytest.skip(f"Skipping {estimator_class} because does not have component object.") + pytest.skip( + f"Skipping {estimator_class} because does not have component object." + ) if estimator_class.model_family == ModelFamily.ENSEMBLE: params = estimator.parameters else: @@ -918,6 +1298,7 @@ class MockComponent(ComponentBase): class MockEstimatorSubclass(MockComponent): pass + assert MockComponent() != MockEstimatorSubclass() @@ -927,15 +1308,15 @@ class MockComponent(ComponentBase): model_family = ModelFamily.NONE def __init__(self, param_1=0, param_2=0, random_seed=0, **kwargs): - parameters = {"param_1": param_1, - "param_2": param_2} + parameters = {"param_1": param_1, "param_2": param_2} parameters.update(kwargs) - super().__init__(parameters=parameters, - component_obj=None, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=None, random_seed=random_seed + ) def fit(self, X, y=None): return self + # Test self-equality mock_component = MockComponent() assert mock_component == mock_component @@ -958,13 +1339,19 @@ def fit(self, X, y=None): @pytest.mark.parametrize("component_class", all_components()) -def test_component_equality_all_components(component_class, - logistic_regression_binary_pipeline_class, - linear_regression_pipeline_class): +def test_component_equality_all_components( + component_class, + logistic_regression_binary_pipeline_class, + linear_regression_pipeline_class, +): if component_class == StackedEnsembleClassifier: - component = component_class(input_pipelines=[logistic_regression_binary_pipeline_class(parameters={})]) + component = component_class( + input_pipelines=[logistic_regression_binary_pipeline_class(parameters={})] + ) elif component_class == StackedEnsembleRegressor: - component = component_class(input_pipelines=[linear_regression_pipeline_class(parameters={})]) + component = component_class( + input_pipelines=[linear_regression_pipeline_class(parameters={})] + ) else: component = component_class() parameters = component.parameters @@ -988,65 +1375,79 @@ def test_component_equality_with_subclasses(test_classes): def test_mock_component_str(test_classes): MockComponent, MockEstimator, MockTransformer = test_classes - assert str(MockComponent()) == 'Mock Component' - assert str(MockEstimator()) == 'Mock Estimator' - assert str(MockTransformer()) == 'Mock Transformer' + assert str(MockComponent()) == "Mock Component" + assert str(MockEstimator()) == "Mock Estimator" + assert str(MockTransformer()) == "Mock Transformer" def test_mock_component_repr(): component = MockFitComponent() - assert repr(component) == 
'MockFitComponent(param_a=2, param_b=10)' + assert repr(component) == "MockFitComponent(param_a=2, param_b=10)" component_with_params = MockFitComponent(param_a=29, param_b=None, random_seed=42) - assert repr(component_with_params) == 'MockFitComponent(param_a=29, param_b=None)' + assert repr(component_with_params) == "MockFitComponent(param_a=29, param_b=None)" - component_with_nan = MockFitComponent(param_a=np.nan, param_b=float('nan')) - assert repr(component_with_nan) == 'MockFitComponent(param_a=np.nan, param_b=np.nan)' + component_with_nan = MockFitComponent(param_a=np.nan, param_b=float("nan")) + assert ( + repr(component_with_nan) == "MockFitComponent(param_a=np.nan, param_b=np.nan)" + ) - component_with_inf = MockFitComponent(param_a=np.inf, param_b=float('-inf')) - assert repr(component_with_inf) == "MockFitComponent(param_a=float('inf'), param_b=float('-inf'))" + component_with_inf = MockFitComponent(param_a=np.inf, param_b=float("-inf")) + assert ( + repr(component_with_inf) + == "MockFitComponent(param_a=float('inf'), param_b=float('-inf'))" + ) @pytest.mark.parametrize("component_class", all_components()) -def test_component_str(component_class, logistic_regression_binary_pipeline_class, linear_regression_pipeline_class): +def test_component_str( + component_class, + logistic_regression_binary_pipeline_class, + linear_regression_pipeline_class, +): try: component = component_class() except EnsembleMissingPipelinesError: if component_class == StackedEnsembleClassifier: - component = component_class(input_pipelines=[logistic_regression_binary_pipeline_class(parameters={})]) + component = component_class( + input_pipelines=[ + logistic_regression_binary_pipeline_class(parameters={}) + ] + ) elif component_class == StackedEnsembleRegressor: - component = component_class(input_pipelines=[linear_regression_pipeline_class(parameters={})]) + component = component_class( + input_pipelines=[linear_regression_pipeline_class(parameters={})] + ) assert str(component) == component.name -@pytest.mark.parametrize("categorical", [{ - "type": Categorical(["mean", "median", "mode"]), - "categories": Categorical(["blue", "green"]) -}, - { - "type": ["mean", "median", "mode"], - "categories": ["blue", "green"] -} -]) +@pytest.mark.parametrize( + "categorical", + [ + { + "type": Categorical(["mean", "median", "mode"]), + "categories": Categorical(["blue", "green"]), + }, + {"type": ["mean", "median", "mode"], "categories": ["blue", "green"]}, + ], +) def test_categorical_hyperparameters(X_y_binary, categorical): X, y = X_y_binary - class MockEstimator(): + class MockEstimator: def fit(self, X, y): pass class MockComponent(Estimator): - name = 'Mock Estimator' + name = "Mock Estimator" model_family = ModelFamily.LINEAR_MODEL - supported_problem_types = ['binary'] + supported_problem_types = ["binary"] hyperparameter_ranges = categorical def __init__(self, agg_type, category="green"): parameters = {"type": agg_type, "categories": category} est = MockEstimator() - super().__init__(parameters=parameters, - component_obj=est, - random_seed=0) + super().__init__(parameters=parameters, component_obj=est, random_seed=0) assert MockComponent(agg_type="mean").fit(X, y) assert MockComponent(agg_type="moat", category="blue").fit(X, y) @@ -1067,18 +1468,24 @@ def test_generate_code_errors(): def test_generate_code(): - expected_code = "from evalml.pipelines.components.estimators.classifiers.logistic_regression import LogisticRegressionClassifier" \ - "\n\nlogisticRegressionClassifier = 
LogisticRegressionClassifier(**{'penalty': 'l2', 'C': 1.0, 'n_jobs': -1, 'multi_class': 'auto', 'solver': 'lbfgs'})" + expected_code = ( + "from evalml.pipelines.components.estimators.classifiers.logistic_regression import LogisticRegressionClassifier" + "\n\nlogisticRegressionClassifier = LogisticRegressionClassifier(**{'penalty': 'l2', 'C': 1.0, 'n_jobs': -1, 'multi_class': 'auto', 'solver': 'lbfgs'})" + ) component_code = generate_component_code(LogisticRegressionClassifier()) assert component_code == expected_code - expected_code = "from evalml.pipelines.components.estimators.regressors.et_regressor import ExtraTreesRegressor" \ - "\n\nextraTreesRegressor = ExtraTreesRegressor(**{'n_estimators': 50, 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1})" + expected_code = ( + "from evalml.pipelines.components.estimators.regressors.et_regressor import ExtraTreesRegressor" + "\n\nextraTreesRegressor = ExtraTreesRegressor(**{'n_estimators': 50, 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1})" + ) component_code = generate_component_code(ExtraTreesRegressor(n_estimators=50)) assert component_code == expected_code - expected_code = "from evalml.pipelines.components.transformers.imputers.imputer import Imputer" \ - "\n\nimputer = Imputer(**{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None})" + expected_code = ( + "from evalml.pipelines.components.transformers.imputers.imputer import Imputer" + "\n\nimputer = Imputer(**{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None})" + ) component_code = generate_component_code(Imputer()) assert component_code == expected_code @@ -1101,7 +1508,9 @@ def test_generate_code_custom(test_classes): @pytest.mark.parametrize("transformer_class", _all_transformers()) @pytest.mark.parametrize("use_custom_index", [True, False]) -def test_transformer_fit_and_transform_respect_custom_indices(use_custom_index, transformer_class, X_y_binary): +def test_transformer_fit_and_transform_respect_custom_indices( + use_custom_index, transformer_class, X_y_binary +): check_names = True if transformer_class == DFSTransformer: @@ -1109,8 +1518,10 @@ def test_transformer_fit_and_transform_respect_custom_indices(use_custom_index, if use_custom_index: pytest.skip("The DFSTransformer changes the index so we skip it.") if transformer_class == PolynomialDetrender: - pytest.skip("Skipping PolynomialDetrender because we test that it respects custom indices in " - "test_polynomial_detrender.py") + pytest.skip( + "Skipping PolynomialDetrender because we test that it respects custom indices in " + "test_polynomial_detrender.py" + ) X, y = X_y_binary @@ -1132,30 +1543,47 @@ def test_transformer_fit_and_transform_respect_custom_indices(use_custom_index, pd.testing.assert_index_equal(X.index, X_original_index) pd.testing.assert_index_equal(y.index, y_original_index) - if 'sampler' in transformer.name: + if "sampler" in transformer.name: X_t, y_t = transformer.transform(X, y) assert y_t is None elif transformer_class == TargetImputer: X_t, y_t = transformer.transform(X, y) - pd.testing.assert_index_equal(y_t.index, y_original_index, check_names=check_names) + pd.testing.assert_index_equal( + y_t.index, y_original_index, check_names=check_names + ) else: X_t = transformer.transform(X, y) - 
pd.testing.assert_index_equal(y.index, y_original_index, check_names=check_names) + pd.testing.assert_index_equal( + y.index, y_original_index, check_names=check_names + ) pd.testing.assert_index_equal(X_t.index, X_original_index, check_names=check_names) @pytest.mark.parametrize("estimator_class", _all_estimators()) @pytest.mark.parametrize("use_custom_index", [True, False]) -def test_estimator_fit_respects_custom_indices(use_custom_index, estimator_class, - X_y_binary, X_y_regression, ts_data, - logistic_regression_binary_pipeline_class, - linear_regression_pipeline_class, - helper_functions): +def test_estimator_fit_respects_custom_indices( + use_custom_index, + estimator_class, + X_y_binary, + X_y_regression, + ts_data, + logistic_regression_binary_pipeline_class, + linear_regression_pipeline_class, + helper_functions, +): if estimator_class == StackedEnsembleRegressor: - input_pipelines = [helper_functions.safe_init_pipeline_with_njobs_1(linear_regression_pipeline_class)] + input_pipelines = [ + helper_functions.safe_init_pipeline_with_njobs_1( + linear_regression_pipeline_class + ) + ] elif estimator_class == StackedEnsembleClassifier: - input_pipelines = [helper_functions.safe_init_pipeline_with_njobs_1(logistic_regression_binary_pipeline_class)] + input_pipelines = [ + helper_functions.safe_init_pipeline_with_njobs_1( + logistic_regression_binary_pipeline_class + ) + ] else: input_pipelines = [] diff --git a/evalml/tests/component_tests/test_datetime_featurizer.py b/evalml/tests/component_tests/test_datetime_featurizer.py index 19b44cc721..8bc9e51487 100644 --- a/evalml/tests/component_tests/test_datetime_featurizer.py +++ b/evalml/tests/component_tests/test_datetime_featurizer.py @@ -8,7 +8,7 @@ Datetime, Double, Integer, - NaturalLanguage + NaturalLanguage, ) from evalml.pipelines.components import DateTimeFeaturizer @@ -16,31 +16,57 @@ def test_datetime_featurizer_init(): datetime_transformer = DateTimeFeaturizer() - assert datetime_transformer.parameters == {"features_to_extract": ["year", "month", "day_of_week", "hour"], - "encode_as_categories": False, - "date_index": None} - - datetime_transformer = DateTimeFeaturizer(features_to_extract=["year", "month"], encode_as_categories=True) - assert datetime_transformer.parameters == {"features_to_extract": ["year", "month"], - "encode_as_categories": True, - "date_index": None} + assert datetime_transformer.parameters == { + "features_to_extract": ["year", "month", "day_of_week", "hour"], + "encode_as_categories": False, + "date_index": None, + } + + datetime_transformer = DateTimeFeaturizer( + features_to_extract=["year", "month"], encode_as_categories=True + ) + assert datetime_transformer.parameters == { + "features_to_extract": ["year", "month"], + "encode_as_categories": True, + "date_index": None, + } with pytest.raises(ValueError, match="not valid options for features_to_extract"): DateTimeFeaturizer(features_to_extract=["invalid", "parameters"]) def test_datetime_featurizer_encodes_as_ints(): - X = pd.DataFrame({"date": ["2016-04-10 16:10:09", "2017-03-15 13:32:05", "2018-07-10 07:15:10", - "2019-08-19 20:20:20", "2020-01-03 06:45:12"]}) + X = pd.DataFrame( + { + "date": [ + "2016-04-10 16:10:09", + "2017-03-15 13:32:05", + "2018-07-10 07:15:10", + "2019-08-19 20:20:20", + "2020-01-03 06:45:12", + ] + } + ) dt = DateTimeFeaturizer() X_transformed_df = dt.fit_transform(X) - expected = pd.DataFrame({"date_year": pd.Series([2016, 2017, 2018, 2019, 2020]), - "date_month": pd.Series([3, 2, 6, 7, 0]), - "date_day_of_week": 
pd.Series([0, 3, 2, 1, 5]), - "date_hour": pd.Series([16, 13, 7, 20, 6])}) - feature_names = {'date_month': {'April': 3, 'March': 2, 'July': 6, 'August': 7, 'January': 0}, - 'date_day_of_week': {'Sunday': 0, 'Wednesday': 3, 'Tuesday': 2, 'Monday': 1, 'Friday': 5} - } + expected = pd.DataFrame( + { + "date_year": pd.Series([2016, 2017, 2018, 2019, 2020]), + "date_month": pd.Series([3, 2, 6, 7, 0]), + "date_day_of_week": pd.Series([0, 3, 2, 1, 5]), + "date_hour": pd.Series([16, 13, 7, 20, 6]), + } + ) + feature_names = { + "date_month": {"April": 3, "March": 2, "July": 6, "August": 7, "January": 0}, + "date_day_of_week": { + "Sunday": 0, + "Wednesday": 3, + "Tuesday": 2, + "Monday": 1, + "Friday": 5, + }, + } assert_frame_equal(expected, X_transformed_df) assert dt.get_feature_names() == feature_names @@ -56,13 +82,19 @@ def test_datetime_featurizer_encodes_as_ints(): # Test that sequential calls to the same DateTimeFeaturizer work as expected by using the first dt we defined X = pd.DataFrame({"date": ["2020-04-10", "2017-03-15", "2019-08-19"]}) X_transformed_df = dt.fit_transform(X) - expected = pd.DataFrame({"date_year": pd.Series([2020, 2017, 2019]), - "date_month": pd.Series([3, 2, 7]), - "date_day_of_week": pd.Series([5, 3, 1]), - "date_hour": pd.Series([0, 0, 0])}) + expected = pd.DataFrame( + { + "date_year": pd.Series([2020, 2017, 2019]), + "date_month": pd.Series([3, 2, 7]), + "date_day_of_week": pd.Series([5, 3, 1]), + "date_hour": pd.Series([0, 0, 0]), + } + ) assert_frame_equal(expected, X_transformed_df) - assert dt.get_feature_names() == {'date_month': {'April': 3, 'March': 2, 'August': 7}, - 'date_day_of_week': {'Friday': 5, 'Wednesday': 3, 'Monday': 1}} + assert dt.get_feature_names() == { + "date_month": {"April": 3, "March": 2, "August": 7}, + "date_day_of_week": {"Friday": 5, "Wednesday": 3, "Monday": 1}, + } dt = DateTimeFeaturizer(features_to_extract=["year", "hour"]) dt.fit_transform(X) @@ -71,17 +103,30 @@ def test_datetime_featurizer_encodes_as_ints(): def test_datetime_featurizer_transform(): datetime_transformer = DateTimeFeaturizer(features_to_extract=["year"]) - X = pd.DataFrame({'Numerical 1': range(20), - 'Date Col 1': pd.date_range('2000-05-19', periods=20, freq='D'), - 'Date Col 2': pd.date_range('2000-02-03', periods=20, freq='W'), - 'Numerical 2': [0] * 20}) - X_test = pd.DataFrame({'Numerical 1': range(20), - 'Date Col 1': pd.date_range('2020-05-19', periods=20, freq='D'), - 'Date Col 2': pd.date_range('2020-02-03', periods=20, freq='W'), - 'Numerical 2': [0] * 20}) + X = pd.DataFrame( + { + "Numerical 1": range(20), + "Date Col 1": pd.date_range("2000-05-19", periods=20, freq="D"), + "Date Col 2": pd.date_range("2000-02-03", periods=20, freq="W"), + "Numerical 2": [0] * 20, + } + ) + X_test = pd.DataFrame( + { + "Numerical 1": range(20), + "Date Col 1": pd.date_range("2020-05-19", periods=20, freq="D"), + "Date Col 2": pd.date_range("2020-02-03", periods=20, freq="W"), + "Numerical 2": [0] * 20, + } + ) datetime_transformer.fit(X) transformed_df = datetime_transformer.transform(X_test) - assert list(transformed_df.columns) == ['Numerical 1', 'Numerical 2', 'Date Col 1_year', 'Date Col 2_year'] + assert list(transformed_df.columns) == [ + "Numerical 1", + "Numerical 2", + "Date Col 1_year", + "Date Col 2_year", + ] assert transformed_df["Date Col 1_year"].equals(pd.Series([2020] * 20)) assert transformed_df["Date Col 2_year"].equals(pd.Series([2020] * 20)) assert datetime_transformer.get_feature_names() == {} @@ -89,44 +134,81 @@ def 
test_datetime_featurizer_transform(): def test_datetime_featurizer_fit_transform(): datetime_transformer = DateTimeFeaturizer(features_to_extract=["year"]) - X = pd.DataFrame({'Numerical 1': range(20), - 'Date Col 1': pd.date_range('2020-05-19', periods=20, freq='D'), - 'Date Col 2': pd.date_range('2020-02-03', periods=20, freq='W'), - 'Numerical 2': [0] * 20}) + X = pd.DataFrame( + { + "Numerical 1": range(20), + "Date Col 1": pd.date_range("2020-05-19", periods=20, freq="D"), + "Date Col 2": pd.date_range("2020-02-03", periods=20, freq="W"), + "Numerical 2": [0] * 20, + } + ) transformed_df = datetime_transformer.fit_transform(X) - assert list(transformed_df.columns) == ['Numerical 1', 'Numerical 2', 'Date Col 1_year', 'Date Col 2_year'] + assert list(transformed_df.columns) == [ + "Numerical 1", + "Numerical 2", + "Date Col 1_year", + "Date Col 2_year", + ] assert transformed_df["Date Col 1_year"].equals(pd.Series([2020] * 20)) assert transformed_df["Date Col 2_year"].equals(pd.Series([2020] * 20)) assert datetime_transformer.get_feature_names() == {} def test_datetime_featurizer_fit_transform_date_index(): - datetime_transformer = DateTimeFeaturizer(features_to_extract=["year"], date_index='Date Col 1') - X = pd.DataFrame({'Numerical 1': range(20), - 'Date Col 1': pd.date_range('2020-05-19', periods=20, freq='D'), - 'Date Col 2': pd.date_range('2020-02-03', periods=20, freq='W'), - 'Numerical 2': [0] * 20}) + datetime_transformer = DateTimeFeaturizer( + features_to_extract=["year"], date_index="Date Col 1" + ) + X = pd.DataFrame( + { + "Numerical 1": range(20), + "Date Col 1": pd.date_range("2020-05-19", periods=20, freq="D"), + "Date Col 2": pd.date_range("2020-02-03", periods=20, freq="W"), + "Numerical 2": [0] * 20, + } + ) transformed_df = datetime_transformer.fit_transform(X) - assert list(transformed_df.columns) == ['Numerical 1', 'Numerical 2', 'Date Col 1_year', 'Date Col 2_year'] - assert transformed_df["Date Col 1_year"].equals(pd.Series([2020] * 20, dtype="int64")) - assert transformed_df["Date Col 2_year"].equals(pd.Series([2020] * 20, dtype="int64")) + assert list(transformed_df.columns) == [ + "Numerical 1", + "Numerical 2", + "Date Col 1_year", + "Date Col 2_year", + ] + assert transformed_df["Date Col 1_year"].equals( + pd.Series([2020] * 20, dtype="int64") + ) + assert transformed_df["Date Col 2_year"].equals( + pd.Series([2020] * 20, dtype="int64") + ) assert datetime_transformer.get_feature_names() == {} def test_datetime_featurizer_no_col_names(): datetime_transformer = DateTimeFeaturizer() - X = pd.DataFrame(pd.Series(pd.date_range('2020-02-24', periods=10, freq='D'))) + X = pd.DataFrame(pd.Series(pd.date_range("2020-02-24", periods=10, freq="D"))) datetime_transformer.fit(X) - assert list(datetime_transformer.transform(X).columns) == ['0_year', '0_month', '0_day_of_week', '0_hour'] - assert datetime_transformer.get_feature_names() == {'0_month': {'February': 1, 'March': 2}, - '0_day_of_week': {'Monday': 1, 'Tuesday': 2, - 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, - 'Saturday': 6, 'Sunday': 0}} + assert list(datetime_transformer.transform(X).columns) == [ + "0_year", + "0_month", + "0_day_of_week", + "0_hour", + ] + assert datetime_transformer.get_feature_names() == { + "0_month": {"February": 1, "March": 2}, + "0_day_of_week": { + "Monday": 1, + "Tuesday": 2, + "Wednesday": 3, + "Thursday": 4, + "Friday": 5, + "Saturday": 6, + "Sunday": 0, + }, + } def test_datetime_featurizer_no_features_to_extract(): datetime_transformer = 
DateTimeFeaturizer(features_to_extract=[]) - rng = pd.date_range('2020-02-24', periods=20, freq='D') + rng = pd.date_range("2020-02-24", periods=20, freq="D") X = pd.DataFrame({"date col": rng, "numerical": [0] * len(rng)}) expected = X.copy() expected.ww.init() @@ -138,11 +220,17 @@ def test_datetime_featurizer_no_features_to_extract(): def test_datetime_featurizer_custom_features_to_extract(): datetime_transformer = DateTimeFeaturizer(features_to_extract=["month", "year"]) - rng = pd.date_range('2020-02-24', periods=20, freq='D') + rng = pd.date_range("2020-02-24", periods=20, freq="D") X = pd.DataFrame({"date col": rng, "numerical": [0] * len(rng)}) datetime_transformer.fit(X) - assert list(datetime_transformer.transform(X).columns) == ["numerical", "date col_month", "date col_year"] - assert datetime_transformer.get_feature_names() == {"date col_month": {"February": 1, "March": 2}} + assert list(datetime_transformer.transform(X).columns) == [ + "numerical", + "date col_month", + "date col_year", + ] + assert datetime_transformer.get_feature_names() == { + "date col_month": {"February": 1, "March": 2} + } def test_datetime_featurizer_no_datetime_cols(): @@ -157,56 +245,107 @@ def test_datetime_featurizer_no_datetime_cols(): def test_datetime_featurizer_numpy_array_input(): datetime_transformer = DateTimeFeaturizer() - X = np.array([['2007-02-03'], ['2016-06-07'], ['2020-05-19']], dtype='datetime64') + X = np.array([["2007-02-03"], ["2016-06-07"], ["2020-05-19"]], dtype="datetime64") datetime_transformer.fit(X) - assert list(datetime_transformer.transform(X).columns) == ["0_year", "0_month", "0_day_of_week", "0_hour"] - assert datetime_transformer.get_feature_names() == {'0_month': {'February': 1, 'June': 5, 'May': 4}, - '0_day_of_week': {'Saturday': 6, 'Tuesday': 2}} + assert list(datetime_transformer.transform(X).columns) == [ + "0_year", + "0_month", + "0_day_of_week", + "0_hour", + ] + assert datetime_transformer.get_feature_names() == { + "0_month": {"February": 1, "June": 5, "May": 4}, + "0_day_of_week": {"Saturday": 6, "Tuesday": 2}, + } def test_datetime_featurizer_does_not_modify_input_data(): datetime_transformer = DateTimeFeaturizer(features_to_extract=["month", "year"]) - rng = pd.date_range('2020-02-24', periods=20, freq='D') + rng = pd.date_range("2020-02-24", periods=20, freq="D") X = pd.DataFrame({"date col": rng, "numerical": [0] * len(rng)}) expected = X.copy() _ = datetime_transformer.fit_transform(X) pd.testing.assert_frame_equal(X, expected) -@pytest.mark.parametrize("X_df", [pd.DataFrame(pd.to_datetime(['20190902', '20200519', '20190607'], format='%Y%m%d')), - pd.DataFrame(pd.Series([1, 2, 3])), - pd.DataFrame(pd.Series([1., 2., 3.], dtype="float")), - pd.DataFrame(pd.Series(['a', 'b', 'a'], dtype="category")), - pd.DataFrame(pd.Series(['this will be a natural language column because length', 'yay', 'hay'], dtype="string"))]) +@pytest.mark.parametrize( + "X_df", + [ + pd.DataFrame( + pd.to_datetime(["20190902", "20200519", "20190607"], format="%Y%m%d") + ), + pd.DataFrame(pd.Series([1, 2, 3])), + pd.DataFrame(pd.Series([1.0, 2.0, 3.0], dtype="float")), + pd.DataFrame(pd.Series(["a", "b", "a"], dtype="category")), + pd.DataFrame( + pd.Series( + ["this will be a natural language column because length", "yay", "hay"], + dtype="string", + ) + ), + ], +) @pytest.mark.parametrize("with_datetime_col", [True, False]) @pytest.mark.parametrize("encode_as_categories", [True, False]) -def 
test_datetime_featurizer_woodwork_custom_overrides_returned_by_components(with_datetime_col, encode_as_categories, X_df): +def test_datetime_featurizer_woodwork_custom_overrides_returned_by_components( + with_datetime_col, encode_as_categories, X_df +): override_types = [Integer, Double, Categorical, NaturalLanguage, Datetime] if with_datetime_col: - X_df['datetime col'] = pd.to_datetime(['20200101', '20200519', '20190607'], format='%Y%m%d') + X_df["datetime col"] = pd.to_datetime( + ["20200101", "20200519", "20190607"], format="%Y%m%d" + ) for logical_type in override_types: try: X = X_df.copy() X.ww.init(logical_types={0: logical_type}) except (ww.exceptions.TypeConversionError, TypeError): continue - datetime_transformer = DateTimeFeaturizer(encode_as_categories=encode_as_categories) + datetime_transformer = DateTimeFeaturizer( + encode_as_categories=encode_as_categories + ) datetime_transformer.fit(X) transformed = datetime_transformer.transform(X) assert isinstance(transformed, pd.DataFrame) if with_datetime_col: if encode_as_categories: - datetime_col_transformed = {'datetime col_year': Integer, 'datetime col_month': Categorical, 'datetime col_day_of_week': Categorical, 'datetime col_hour': Integer} + datetime_col_transformed = { + "datetime col_year": Integer, + "datetime col_month": Categorical, + "datetime col_day_of_week": Categorical, + "datetime col_hour": Integer, + } else: - datetime_col_transformed = {'datetime col_year': Integer, 'datetime col_month': Integer, 'datetime col_day_of_week': Integer, 'datetime col_hour': Integer} - assert all(item in transformed.ww.logical_types.items() for item in datetime_col_transformed.items()) + datetime_col_transformed = { + "datetime col_year": Integer, + "datetime col_month": Integer, + "datetime col_day_of_week": Integer, + "datetime col_hour": Integer, + } + assert all( + item in transformed.ww.logical_types.items() + for item in datetime_col_transformed.items() + ) if logical_type == Datetime: if encode_as_categories: - col_transformed = {'0_year': Integer, '0_month': Categorical, '0_day_of_week': Categorical, '0_hour': Integer} + col_transformed = { + "0_year": Integer, + "0_month": Categorical, + "0_day_of_week": Categorical, + "0_hour": Integer, + } else: - col_transformed = {'0_year': Integer, '0_month': Integer, '0_day_of_week': Integer, '0_hour': Integer} - assert all(item in transformed.ww.logical_types.items() for item in col_transformed.items()) + col_transformed = { + "0_year": Integer, + "0_month": Integer, + "0_day_of_week": Integer, + "0_hour": Integer, + } + assert all( + item in transformed.ww.logical_types.items() + for item in col_transformed.items() + ) else: assert transformed.ww.logical_types[0] == logical_type diff --git a/evalml/tests/component_tests/test_decision_tree_classifier.py b/evalml/tests/component_tests/test_decision_tree_classifier.py index 34c49a0dd8..110e8c9fb8 100644 --- a/evalml/tests/component_tests/test_decision_tree_classifier.py +++ b/evalml/tests/component_tests/test_decision_tree_classifier.py @@ -11,15 +11,18 @@ def test_model_family(): def test_problem_types(): - assert set(DecisionTreeClassifier.supported_problem_types) == {ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY} + assert set(DecisionTreeClassifier.supported_problem_types) == { + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + } def test_fit_predict_binary(X_y_binary): X, y = 
X_y_binary - sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features='auto', random_state=0) + sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features="auto", random_state=0) sk_clf.fit(X, y) y_pred_sk = sk_clf.predict(X) y_pred_proba_sk = sk_clf.predict_proba(X) @@ -36,7 +39,7 @@ def test_fit_predict_binary(X_y_binary): def test_fit_predict_multi(X_y_multi): X, y = X_y_multi - sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features='auto', random_state=0) + sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features="auto", random_state=0) sk_clf.fit(X, y) y_pred_sk = sk_clf.predict(X) y_pred_proba_sk = sk_clf.predict_proba(X) @@ -56,7 +59,7 @@ def test_feature_importance(X_y_binary): X, y = X_y_binary clf = DecisionTreeClassifier() - sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features='auto', random_state=0) + sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features="auto", random_state=0) sk_clf.fit(X, y) sk_feature_importance = sk_clf.feature_importances_ diff --git a/evalml/tests/component_tests/test_decision_tree_regressor.py b/evalml/tests/component_tests/test_decision_tree_regressor.py index 42944b9673..fe31d98872 100644 --- a/evalml/tests/component_tests/test_decision_tree_regressor.py +++ b/evalml/tests/component_tests/test_decision_tree_regressor.py @@ -11,14 +11,16 @@ def test_model_family(): def test_problem_types(): - assert set(DecisionTreeRegressor.supported_problem_types) == {ProblemTypes.REGRESSION, - ProblemTypes.TIME_SERIES_REGRESSION} + assert set(DecisionTreeRegressor.supported_problem_types) == { + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + } def test_fit_predict(X_y_regression): X, y = X_y_regression - sk_clf = SKDecisionTreeRegressor(max_depth=6, max_features='auto', random_state=0) + sk_clf = SKDecisionTreeRegressor(max_depth=6, max_features="auto", random_state=0) sk_clf.fit(X, y) y_pred_sk = sk_clf.predict(X) @@ -34,7 +36,7 @@ def test_feature_importance(X_y_regression): X, y = X_y_regression clf = DecisionTreeRegressor() - sk_clf = SKDecisionTreeRegressor(max_depth=6, max_features='auto', random_state=0) + sk_clf = SKDecisionTreeRegressor(max_depth=6, max_features="auto", random_state=0) sk_clf.fit(X, y) sk_feature_importance = sk_clf.feature_importances_ diff --git a/evalml/tests/component_tests/test_delayed_features_transformer.py b/evalml/tests/component_tests/test_delayed_features_transformer.py index 665005f1cd..5d1892da69 100644 --- a/evalml/tests/component_tests/test_delayed_features_transformer.py +++ b/evalml/tests/component_tests/test_delayed_features_transformer.py @@ -2,13 +2,7 @@ import pytest import woodwork as ww from pandas.testing import assert_frame_equal -from woodwork.logical_types import ( - Boolean, - Categorical, - Datetime, - Double, - Integer -) +from woodwork.logical_types import Boolean, Categorical, Datetime, Double, Integer from evalml.pipelines import DelayedFeatureTransformer @@ -21,10 +15,20 @@ def delayed_features_data(): def test_delayed_features_transformer_init(): - delayed_features = DelayedFeatureTransformer(max_delay=4, delay_features=True, delay_target=False, date_index="Date", - random_seed=1) - assert delayed_features.parameters == {"max_delay": 4, "delay_features": True, "delay_target": False, - "gap": 1, "date_index": "Date"} + delayed_features = DelayedFeatureTransformer( + max_delay=4, + delay_features=True, + delay_target=False, + date_index="Date", + random_seed=1, + ) + assert delayed_features.parameters == { + "max_delay": 4, + "delay_features": True, + 
"delay_target": False, + "gap": 1, + "date_index": "Date", + } def encode_y_as_string(y): @@ -51,235 +55,367 @@ def encode_X_y_as_strings(X, y, encode_X_as_str, encode_y_as_str): return X, X_answer, y, y_answer -@pytest.mark.parametrize('encode_X_as_str', [True, False]) -@pytest.mark.parametrize('encode_y_as_str', [True, False]) -def test_delayed_feature_extractor_maxdelay3_gap1(encode_X_as_str, encode_y_as_str, delayed_features_data): +@pytest.mark.parametrize("encode_X_as_str", [True, False]) +@pytest.mark.parametrize("encode_y_as_str", [True, False]) +def test_delayed_feature_extractor_maxdelay3_gap1( + encode_X_as_str, encode_y_as_str, delayed_features_data +): X, y = delayed_features_data - X, X_answer, y, y_answer = encode_X_y_as_strings(X, y, encode_X_as_str, encode_y_as_str) - answer = pd.DataFrame({"feature": X.feature, - "feature_delay_1": X_answer.feature.shift(1), - "feature_delay_2": X_answer.feature.shift(2), - "feature_delay_3": X_answer.feature.shift(3), - "target_delay_0": y_answer.astype("int64"), - "target_delay_1": y_answer.shift(1), - "target_delay_2": y_answer.shift(2), - "target_delay_3": y_answer.shift(3)}) + X, X_answer, y, y_answer = encode_X_y_as_strings( + X, y, encode_X_as_str, encode_y_as_str + ) + answer = pd.DataFrame( + { + "feature": X.feature, + "feature_delay_1": X_answer.feature.shift(1), + "feature_delay_2": X_answer.feature.shift(2), + "feature_delay_3": X_answer.feature.shift(3), + "target_delay_0": y_answer.astype("int64"), + "target_delay_1": y_answer.shift(1), + "target_delay_2": y_answer.shift(2), + "target_delay_3": y_answer.shift(3), + } + ) if not encode_X_as_str: answer["feature"] = X.feature.astype("int64") if not encode_y_as_str: answer["target_delay_0"] = y_answer.astype("int64") - assert_frame_equal(answer, DelayedFeatureTransformer(max_delay=3, gap=1).fit_transform(X=X, y=y)) - - answer_only_y = pd.DataFrame({"target_delay_0": y_answer.astype("int64"), - "target_delay_1": y_answer.shift(1), - "target_delay_2": y_answer.shift(2), - "target_delay_3": y_answer.shift(3)}) - assert_frame_equal(answer_only_y, DelayedFeatureTransformer(max_delay=3, gap=1).fit_transform(X=None, y=y)) - - -@pytest.mark.parametrize('encode_X_as_str', [True, False]) -@pytest.mark.parametrize('encode_y_as_str', [True, False]) -def test_delayed_feature_extractor_maxdelay5_gap1(encode_X_as_str, encode_y_as_str, delayed_features_data): + assert_frame_equal( + answer, DelayedFeatureTransformer(max_delay=3, gap=1).fit_transform(X=X, y=y) + ) + + answer_only_y = pd.DataFrame( + { + "target_delay_0": y_answer.astype("int64"), + "target_delay_1": y_answer.shift(1), + "target_delay_2": y_answer.shift(2), + "target_delay_3": y_answer.shift(3), + } + ) + assert_frame_equal( + answer_only_y, + DelayedFeatureTransformer(max_delay=3, gap=1).fit_transform(X=None, y=y), + ) + + +@pytest.mark.parametrize("encode_X_as_str", [True, False]) +@pytest.mark.parametrize("encode_y_as_str", [True, False]) +def test_delayed_feature_extractor_maxdelay5_gap1( + encode_X_as_str, encode_y_as_str, delayed_features_data +): X, y = delayed_features_data - X, X_answer, y, y_answer = encode_X_y_as_strings(X, y, encode_X_as_str, encode_y_as_str) - answer = pd.DataFrame({"feature": X.feature, - "feature_delay_1": X_answer.feature.shift(1), - "feature_delay_2": X_answer.feature.shift(2), - "feature_delay_3": X_answer.feature.shift(3), - "feature_delay_4": X_answer.feature.shift(4), - "feature_delay_5": X_answer.feature.shift(5), - "target_delay_0": y_answer.astype("int64"), - "target_delay_1": 
y_answer.shift(1), - "target_delay_2": y_answer.shift(2), - "target_delay_3": y_answer.shift(3), - "target_delay_4": y_answer.shift(4), - "target_delay_5": y_answer.shift(5)}) + X, X_answer, y, y_answer = encode_X_y_as_strings( + X, y, encode_X_as_str, encode_y_as_str + ) + answer = pd.DataFrame( + { + "feature": X.feature, + "feature_delay_1": X_answer.feature.shift(1), + "feature_delay_2": X_answer.feature.shift(2), + "feature_delay_3": X_answer.feature.shift(3), + "feature_delay_4": X_answer.feature.shift(4), + "feature_delay_5": X_answer.feature.shift(5), + "target_delay_0": y_answer.astype("int64"), + "target_delay_1": y_answer.shift(1), + "target_delay_2": y_answer.shift(2), + "target_delay_3": y_answer.shift(3), + "target_delay_4": y_answer.shift(4), + "target_delay_5": y_answer.shift(5), + } + ) if not encode_X_as_str: answer["feature"] = X.feature.astype("int64") - assert_frame_equal(answer, DelayedFeatureTransformer(max_delay=5, gap=1).fit_transform(X, y)) - - answer_only_y = pd.DataFrame({"target_delay_0": y_answer.astype("int64"), - "target_delay_1": y_answer.shift(1), - "target_delay_2": y_answer.shift(2), - "target_delay_3": y_answer.shift(3), - "target_delay_4": y_answer.shift(4), - "target_delay_5": y_answer.shift(5)}) - assert_frame_equal(answer_only_y, DelayedFeatureTransformer(max_delay=5, gap=1).fit_transform(X=None, y=y)) - - -@pytest.mark.parametrize('encode_X_as_str', [True, False]) -@pytest.mark.parametrize('encode_y_as_str', [True, False]) -def test_delayed_feature_extractor_maxdelay3_gap7(encode_X_as_str, encode_y_as_str, delayed_features_data): + assert_frame_equal( + answer, DelayedFeatureTransformer(max_delay=5, gap=1).fit_transform(X, y) + ) + + answer_only_y = pd.DataFrame( + { + "target_delay_0": y_answer.astype("int64"), + "target_delay_1": y_answer.shift(1), + "target_delay_2": y_answer.shift(2), + "target_delay_3": y_answer.shift(3), + "target_delay_4": y_answer.shift(4), + "target_delay_5": y_answer.shift(5), + } + ) + assert_frame_equal( + answer_only_y, + DelayedFeatureTransformer(max_delay=5, gap=1).fit_transform(X=None, y=y), + ) + + +@pytest.mark.parametrize("encode_X_as_str", [True, False]) +@pytest.mark.parametrize("encode_y_as_str", [True, False]) +def test_delayed_feature_extractor_maxdelay3_gap7( + encode_X_as_str, encode_y_as_str, delayed_features_data +): X, y = delayed_features_data - X, X_answer, y, y_answer = encode_X_y_as_strings(X, y, encode_X_as_str, encode_y_as_str) - answer = pd.DataFrame({"feature": X.feature, - "feature_delay_1": X_answer.feature.shift(1), - "feature_delay_2": X_answer.feature.shift(2), - "feature_delay_3": X_answer.feature.shift(3), - "target_delay_0": y_answer.astype("int64"), - "target_delay_1": y_answer.shift(1), - "target_delay_2": y_answer.shift(2), - "target_delay_3": y_answer.shift(3)}) + X, X_answer, y, y_answer = encode_X_y_as_strings( + X, y, encode_X_as_str, encode_y_as_str + ) + answer = pd.DataFrame( + { + "feature": X.feature, + "feature_delay_1": X_answer.feature.shift(1), + "feature_delay_2": X_answer.feature.shift(2), + "feature_delay_3": X_answer.feature.shift(3), + "target_delay_0": y_answer.astype("int64"), + "target_delay_1": y_answer.shift(1), + "target_delay_2": y_answer.shift(2), + "target_delay_3": y_answer.shift(3), + } + ) if not encode_X_as_str: answer["feature"] = X.feature.astype("int64") - assert_frame_equal(answer, DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X, y)) - - answer_only_y = pd.DataFrame({"target_delay_0": y_answer.astype("int64"), - "target_delay_1": 
y_answer.shift(1), - "target_delay_2": y_answer.shift(2), - "target_delay_3": y_answer.shift(3)}) - assert_frame_equal(answer_only_y, DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X=None, y=y)) - - -@pytest.mark.parametrize('encode_X_as_str', [True, False]) -@pytest.mark.parametrize('encode_y_as_str', [True, False]) -def test_delayed_feature_extractor_numpy(encode_X_as_str, encode_y_as_str, delayed_features_data): + assert_frame_equal( + answer, DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X, y) + ) + + answer_only_y = pd.DataFrame( + { + "target_delay_0": y_answer.astype("int64"), + "target_delay_1": y_answer.shift(1), + "target_delay_2": y_answer.shift(2), + "target_delay_3": y_answer.shift(3), + } + ) + assert_frame_equal( + answer_only_y, + DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X=None, y=y), + ) + + +@pytest.mark.parametrize("encode_X_as_str", [True, False]) +@pytest.mark.parametrize("encode_y_as_str", [True, False]) +def test_delayed_feature_extractor_numpy( + encode_X_as_str, encode_y_as_str, delayed_features_data +): X, y = delayed_features_data - X, X_answer, y, y_answer = encode_X_y_as_strings(X, y, encode_X_as_str, encode_y_as_str) + X, X_answer, y, y_answer = encode_X_y_as_strings( + X, y, encode_X_as_str, encode_y_as_str + ) X_np = X.values y_np = y.values - answer = pd.DataFrame({0: X.feature, - "0_delay_1": X_answer.feature.shift(1), - "0_delay_2": X_answer.feature.shift(2), - "0_delay_3": X_answer.feature.shift(3), - "target_delay_0": y_answer.astype("int64"), - "target_delay_1": y_answer.shift(1), - "target_delay_2": y_answer.shift(2), - "target_delay_3": y_answer.shift(3)}) + answer = pd.DataFrame( + { + 0: X.feature, + "0_delay_1": X_answer.feature.shift(1), + "0_delay_2": X_answer.feature.shift(2), + "0_delay_3": X_answer.feature.shift(3), + "target_delay_0": y_answer.astype("int64"), + "target_delay_1": y_answer.shift(1), + "target_delay_2": y_answer.shift(2), + "target_delay_3": y_answer.shift(3), + } + ) if not encode_X_as_str: answer[0] = X.feature.astype("int64") - assert_frame_equal(answer, DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X_np, y_np)) - - answer_only_y = pd.DataFrame({"target_delay_0": y_answer.astype("int64"), - "target_delay_1": y_answer.shift(1), - "target_delay_2": y_answer.shift(2), - "target_delay_3": y_answer.shift(3)}) - assert_frame_equal(answer_only_y, DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X=None, y=y_np)) - - -@pytest.mark.parametrize("delay_features,delay_target", [(False, True), (True, False), (False, False)]) -@pytest.mark.parametrize('encode_X_as_str', [True, False]) -@pytest.mark.parametrize('encode_y_as_str', [True, False]) -def test_lagged_feature_extractor_delay_features_delay_target(encode_y_as_str, encode_X_as_str, delay_features, - delay_target, - delayed_features_data): + assert_frame_equal( + answer, DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X_np, y_np) + ) + + answer_only_y = pd.DataFrame( + { + "target_delay_0": y_answer.astype("int64"), + "target_delay_1": y_answer.shift(1), + "target_delay_2": y_answer.shift(2), + "target_delay_3": y_answer.shift(3), + } + ) + assert_frame_equal( + answer_only_y, + DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X=None, y=y_np), + ) + + +@pytest.mark.parametrize( + "delay_features,delay_target", [(False, True), (True, False), (False, False)] +) +@pytest.mark.parametrize("encode_X_as_str", [True, False]) +@pytest.mark.parametrize("encode_y_as_str", [True, False]) +def 
test_lagged_feature_extractor_delay_features_delay_target( + encode_y_as_str, + encode_X_as_str, + delay_features, + delay_target, + delayed_features_data, +): X, y = delayed_features_data - X, X_answer, y, y_answer = encode_X_y_as_strings(X, y, encode_X_as_str, encode_y_as_str) - all_delays = pd.DataFrame({"feature": X.feature, - "feature_delay_1": X_answer.feature.shift(1), - "feature_delay_2": X_answer.feature.shift(2), - "feature_delay_3": X_answer.feature.shift(3), - "target_delay_0": y_answer.astype("int64"), - "target_delay_1": y_answer.shift(1), - "target_delay_2": y_answer.shift(2), - "target_delay_3": y_answer.shift(3)}) + X, X_answer, y, y_answer = encode_X_y_as_strings( + X, y, encode_X_as_str, encode_y_as_str + ) + all_delays = pd.DataFrame( + { + "feature": X.feature, + "feature_delay_1": X_answer.feature.shift(1), + "feature_delay_2": X_answer.feature.shift(2), + "feature_delay_3": X_answer.feature.shift(3), + "target_delay_0": y_answer.astype("int64"), + "target_delay_1": y_answer.shift(1), + "target_delay_2": y_answer.shift(2), + "target_delay_3": y_answer.shift(3), + } + ) if not encode_X_as_str: all_delays["feature"] = X.feature.astype("int64") if not delay_features: - all_delays = all_delays.drop(columns=[c for c in all_delays.columns if "feature_" in c]) + all_delays = all_delays.drop( + columns=[c for c in all_delays.columns if "feature_" in c] + ) if not delay_target: - all_delays = all_delays.drop(columns=[c for c in all_delays.columns if "target" in c]) + all_delays = all_delays.drop( + columns=[c for c in all_delays.columns if "target" in c] + ) - transformer = DelayedFeatureTransformer(max_delay=3, gap=1, - delay_features=delay_features, delay_target=delay_target) + transformer = DelayedFeatureTransformer( + max_delay=3, gap=1, delay_features=delay_features, delay_target=delay_target + ) assert_frame_equal(all_delays, transformer.fit_transform(X, y)) -@pytest.mark.parametrize("delay_features,delay_target", [(False, True), (True, False), (False, False)]) -@pytest.mark.parametrize('encode_X_as_str', [True, False]) -@pytest.mark.parametrize('encode_y_as_str', [True, False]) -def test_lagged_feature_extractor_delay_target(encode_y_as_str, encode_X_as_str, delay_features, - delay_target, delayed_features_data): +@pytest.mark.parametrize( + "delay_features,delay_target", [(False, True), (True, False), (False, False)] +) +@pytest.mark.parametrize("encode_X_as_str", [True, False]) +@pytest.mark.parametrize("encode_y_as_str", [True, False]) +def test_lagged_feature_extractor_delay_target( + encode_y_as_str, + encode_X_as_str, + delay_features, + delay_target, + delayed_features_data, +): X, y = delayed_features_data - X, X_answer, y, y_answer = encode_X_y_as_strings(X, y, encode_X_as_str, encode_y_as_str) + X, X_answer, y, y_answer = encode_X_y_as_strings( + X, y, encode_X_as_str, encode_y_as_str + ) answer = pd.DataFrame() if delay_target: - answer = pd.DataFrame({"target_delay_0": y_answer.astype("int64"), - "target_delay_1": y_answer.shift(1), - "target_delay_2": y_answer.shift(2), - "target_delay_3": y_answer.shift(3)}) - - transformer = DelayedFeatureTransformer(max_delay=3, gap=1, - delay_features=delay_features, delay_target=delay_target) + answer = pd.DataFrame( + { + "target_delay_0": y_answer.astype("int64"), + "target_delay_1": y_answer.shift(1), + "target_delay_2": y_answer.shift(2), + "target_delay_3": y_answer.shift(3), + } + ) + + transformer = DelayedFeatureTransformer( + max_delay=3, gap=1, delay_features=delay_features, delay_target=delay_target + ) 
assert_frame_equal(answer, transformer.fit_transform(None, y)) @pytest.mark.parametrize("gap", [0, 1, 7]) def test_target_delay_when_gap_is_0(gap, delayed_features_data): X, y = delayed_features_data - expected = pd.DataFrame({"feature": X.feature.astype("int64"), - "feature_delay_1": X.feature.shift(1), - "target_delay_0": y.astype("int64"), - "target_delay_1": y.shift(1)}) + expected = pd.DataFrame( + { + "feature": X.feature.astype("int64"), + "feature_delay_1": X.feature.shift(1), + "target_delay_0": y.astype("int64"), + "target_delay_1": y.shift(1), + } + ) if gap == 0: expected = expected.drop(columns=["target_delay_0"]) transformer = DelayedFeatureTransformer(max_delay=1, gap=gap) assert_frame_equal(expected, transformer.fit_transform(X, y)) - expected = pd.DataFrame({"target_delay_0": y.astype("int64"), - "target_delay_1": y.shift(1)}) + expected = pd.DataFrame( + {"target_delay_0": y.astype("int64"), "target_delay_1": y.shift(1)} + ) if gap == 0: expected = expected.drop(columns=["target_delay_0"]) assert_frame_equal(expected, transformer.fit_transform(None, y)) -@pytest.mark.parametrize('encode_X_as_str', [True, False]) -@pytest.mark.parametrize('encode_y_as_str', [True, False]) -@pytest.mark.parametrize('data_type', ['ww', 'pd']) -def test_delay_feature_transformer_supports_custom_index(encode_X_as_str, encode_y_as_str, data_type, make_data_type, - delayed_features_data): +@pytest.mark.parametrize("encode_X_as_str", [True, False]) +@pytest.mark.parametrize("encode_y_as_str", [True, False]) +@pytest.mark.parametrize("data_type", ["ww", "pd"]) +def test_delay_feature_transformer_supports_custom_index( + encode_X_as_str, encode_y_as_str, data_type, make_data_type, delayed_features_data +): X, y = delayed_features_data - X, X_answer, y, y_answer = encode_X_y_as_strings(X, y, encode_X_as_str, encode_y_as_str) + X, X_answer, y, y_answer = encode_X_y_as_strings( + X, y, encode_X_as_str, encode_y_as_str + ) X.index = pd.RangeIndex(50, 81) X_answer.index = pd.RangeIndex(50, 81) y.index = pd.RangeIndex(50, 81) y_answer.index = pd.RangeIndex(50, 81) - answer = pd.DataFrame({"feature": X.feature, - "feature_delay_1": X_answer.feature.shift(1), - "feature_delay_2": X_answer.feature.shift(2), - "feature_delay_3": X_answer.feature.shift(3), - "target_delay_0": y_answer.astype("int64"), - "target_delay_1": y_answer.shift(1), - "target_delay_2": y_answer.shift(2), - "target_delay_3": y_answer.shift(3)}, index=pd.RangeIndex(50, 81)) + answer = pd.DataFrame( + { + "feature": X.feature, + "feature_delay_1": X_answer.feature.shift(1), + "feature_delay_2": X_answer.feature.shift(2), + "feature_delay_3": X_answer.feature.shift(3), + "target_delay_0": y_answer.astype("int64"), + "target_delay_1": y_answer.shift(1), + "target_delay_2": y_answer.shift(2), + "target_delay_3": y_answer.shift(3), + }, + index=pd.RangeIndex(50, 81), + ) if not encode_X_as_str: answer["feature"] = X.feature.astype("int64") X = make_data_type(data_type, X) y = make_data_type(data_type, y) - assert_frame_equal(answer, DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X, y)) - - answer_only_y = pd.DataFrame({"target_delay_0": y_answer.astype("int64"), - "target_delay_1": y_answer.shift(1), - "target_delay_2": y_answer.shift(2), - "target_delay_3": y_answer.shift(3)}, index=pd.RangeIndex(50, 81)) - assert_frame_equal(answer_only_y, DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X=None, y=y)) + assert_frame_equal( + answer, DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X, y) + ) + + 
answer_only_y = pd.DataFrame( + { + "target_delay_0": y_answer.astype("int64"), + "target_delay_1": y_answer.shift(1), + "target_delay_2": y_answer.shift(2), + "target_delay_3": y_answer.shift(3), + }, + index=pd.RangeIndex(50, 81), + ) + assert_frame_equal( + answer_only_y, + DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X=None, y=y), + ) def test_delay_feature_transformer_multiple_categorical_columns(delayed_features_data): X, y = delayed_features_data X, X_answer, y, y_answer = encode_X_y_as_strings(X, y, True, True) - X['feature_2'] = pd.Categorical(["a"] * 10 + ['aa'] * 10 + ['aaa'] * 10 + ['aaaa']) - X_answer['feature_2'] = pd.Series([0] * 10 + [1] * 10 + [2] * 10 + [3]) - answer = pd.DataFrame({"feature": X.feature, - 'feature_2': X.feature_2, - "feature_delay_1": X_answer.feature.shift(1), - "feature_2_delay_1": X_answer.feature_2.shift(1), - "target_delay_0": y_answer.astype("int64"), - "target_delay_1": y_answer.shift(1), - }) - assert_frame_equal(answer, DelayedFeatureTransformer(max_delay=1, gap=11).fit_transform(X, y)) + X["feature_2"] = pd.Categorical(["a"] * 10 + ["aa"] * 10 + ["aaa"] * 10 + ["aaaa"]) + X_answer["feature_2"] = pd.Series([0] * 10 + [1] * 10 + [2] * 10 + [3]) + answer = pd.DataFrame( + { + "feature": X.feature, + "feature_2": X.feature_2, + "feature_delay_1": X_answer.feature.shift(1), + "feature_2_delay_1": X_answer.feature_2.shift(1), + "target_delay_0": y_answer.astype("int64"), + "target_delay_1": y_answer.shift(1), + } + ) + assert_frame_equal( + answer, DelayedFeatureTransformer(max_delay=1, gap=11).fit_transform(X, y) + ) def test_delay_feature_transformer_y_is_none(delayed_features_data): X, _ = delayed_features_data - answer = pd.DataFrame({"feature": X.feature.astype("int64"), - "feature_delay_1": X.feature.shift(1), - }) - assert_frame_equal(answer, DelayedFeatureTransformer(max_delay=1, gap=11).fit_transform(X, y=None)) + answer = pd.DataFrame( + { + "feature": X.feature.astype("int64"), + "feature_delay_1": X.feature.shift(1), + } + ) + assert_frame_equal( + answer, DelayedFeatureTransformer(max_delay=1, gap=11).fit_transform(X, y=None) + ) def test_delayed_feature_transformer_does_not_modify_input_data(delayed_features_data): @@ -290,13 +426,27 @@ def test_delayed_feature_transformer_does_not_modify_input_data(delayed_features assert_frame_equal(X, expected) -@pytest.mark.parametrize("X_df", [pd.DataFrame(pd.to_datetime(['20190902', '20200519', '20190607'], format='%Y%m%d')), - pd.DataFrame(pd.Series([1, 2, 3], dtype="int64")), - pd.DataFrame(pd.Series([1., 2., 3.], dtype="float")), - pd.DataFrame(pd.Series(['a', 'b', 'a'], dtype="category")), - pd.DataFrame(pd.Series(['this will be a natural language column because length', 'yay', 'hay'], dtype="string"))]) -@pytest.mark.parametrize('fit_transform', [True, False]) -def test_delay_feature_transformer_woodwork_custom_overrides_returned_by_components(X_df, fit_transform): +@pytest.mark.parametrize( + "X_df", + [ + pd.DataFrame( + pd.to_datetime(["20190902", "20200519", "20190607"], format="%Y%m%d") + ), + pd.DataFrame(pd.Series([1, 2, 3], dtype="int64")), + pd.DataFrame(pd.Series([1.0, 2.0, 3.0], dtype="float")), + pd.DataFrame(pd.Series(["a", "b", "a"], dtype="category")), + pd.DataFrame( + pd.Series( + ["this will be a natural language column because length", "yay", "hay"], + dtype="string", + ) + ), + ], +) +@pytest.mark.parametrize("fit_transform", [True, False]) +def test_delay_feature_transformer_woodwork_custom_overrides_returned_by_components( + X_df, fit_transform +): y = 
pd.Series([1, 2, 1]) override_types = [Integer, Double, Categorical, Datetime, Boolean] for logical_type in override_types: @@ -313,17 +463,23 @@ def test_delay_feature_transformer_woodwork_custom_overrides_returned_by_compone transformed = dft.transform(X, y) assert isinstance(transformed, pd.DataFrame) if logical_type in [Integer, Double, Categorical]: - assert transformed.ww.logical_types == {0: logical_type, - '0_delay_1': Double, - 'target_delay_0': Integer, - 'target_delay_1': Double} + assert transformed.ww.logical_types == { + 0: logical_type, + "0_delay_1": Double, + "target_delay_0": Integer, + "target_delay_1": Double, + } elif logical_type == Boolean: - assert transformed.ww.logical_types == {0: logical_type, - '0_delay_1': Categorical, - 'target_delay_0': Integer, - 'target_delay_1': Double} + assert transformed.ww.logical_types == { + 0: logical_type, + "0_delay_1": Categorical, + "target_delay_0": Integer, + "target_delay_1": Double, + } else: - assert transformed.ww.logical_types == {0: logical_type, - '0_delay_1': logical_type, - 'target_delay_0': Integer, - 'target_delay_1': Double} + assert transformed.ww.logical_types == { + 0: logical_type, + "0_delay_1": logical_type, + "target_delay_0": Integer, + "target_delay_1": Double, + } diff --git a/evalml/tests/component_tests/test_drop_null_columns_transformer.py b/evalml/tests/component_tests/test_drop_null_columns_transformer.py index 193d3e6ea4..14f461c5dd 100644 --- a/evalml/tests/component_tests/test_drop_null_columns_transformer.py +++ b/evalml/tests/component_tests/test_drop_null_columns_transformer.py @@ -8,7 +8,7 @@ Categorical, Double, Integer, - NaturalLanguage + NaturalLanguage, ) from evalml.pipelines.components import DropNullColumns @@ -27,27 +27,38 @@ def test_drop_null_transformer_init(): assert drop_null_transformer.parameters == {"pct_null_threshold": 0.95} assert drop_null_transformer._cols_to_drop is None - with pytest.raises(ValueError, match="pct_null_threshold must be a float between 0 and 1, inclusive."): + with pytest.raises( + ValueError, + match="pct_null_threshold must be a float between 0 and 1, inclusive.", + ): DropNullColumns(pct_null_threshold=-0.95) - with pytest.raises(ValueError, match="pct_null_threshold must be a float between 0 and 1, inclusive."): + with pytest.raises( + ValueError, + match="pct_null_threshold must be a float between 0 and 1, inclusive.", + ): DropNullColumns(pct_null_threshold=1.01) def test_drop_null_transformer_transform_default_pct_null_threshold(): drop_null_transformer = DropNullColumns() - X = pd.DataFrame({'lots_of_null': [None, None, None, None, 5], - 'no_null': [1, 2, 3, 4, 5]}) - X_expected = X.astype({'lots_of_null': 'float64', 'no_null': 'int64'}) + X = pd.DataFrame( + {"lots_of_null": [None, None, None, None, 5], "no_null": [1, 2, 3, 4, 5]} + ) + X_expected = X.astype({"lots_of_null": "float64", "no_null": "int64"}) drop_null_transformer.fit(X) X_t = drop_null_transformer.transform(X) assert_frame_equal(X_expected, X_t) def test_drop_null_transformer_transform_custom_pct_null_threshold(): - X = pd.DataFrame({'lots_of_null': [None, None, None, None, 5], - 'all_null': [None, None, None, None, None], - 'no_null': [1, 2, 3, 4, 5]}) + X = pd.DataFrame( + { + "lots_of_null": [None, None, None, None, 5], + "all_null": [None, None, None, None, None], + "no_null": [1, 2, 3, 4, 5], + } + ) drop_null_transformer = DropNullColumns(pct_null_threshold=0.5) X_expected = X.drop(["lots_of_null", "all_null"], axis=1) @@ -56,16 +67,26 @@ def 
test_drop_null_transformer_transform_custom_pct_null_threshold(): X_t = drop_null_transformer.transform(X) assert_frame_equal(X_expected, X_t) # check that X is untouched - assert X.equals(pd.DataFrame({'lots_of_null': [None, None, None, None, 5], - 'all_null': [None, None, None, None, None], - 'no_null': [1, 2, 3, 4, 5]})) + assert X.equals( + pd.DataFrame( + { + "lots_of_null": [None, None, None, None, 5], + "all_null": [None, None, None, None, None], + "no_null": [1, 2, 3, 4, 5], + } + ) + ) def test_drop_null_transformer_transform_boundary_pct_null_threshold(): drop_null_transformer = DropNullColumns(pct_null_threshold=0.0) - X = pd.DataFrame({'all_null': [None, None, None, None, None], - 'lots_of_null': [None, None, None, None, 5], - 'some_null': [None, 0, 3, 4, 5]}) + X = pd.DataFrame( + { + "all_null": [None, None, None, None, None], + "lots_of_null": [None, None, None, None, 5], + "some_null": [None, 0, 3, 4, 5], + } + ) drop_null_transformer.fit(X) X_t = drop_null_transformer.transform(X) assert X_t.empty @@ -75,41 +96,63 @@ def test_drop_null_transformer_transform_boundary_pct_null_threshold(): X_t = drop_null_transformer.transform(X) assert_frame_equal(X_t, X.drop(["all_null"], axis=1)) # check that X is untouched - assert X.equals(pd.DataFrame({'all_null': [None, None, None, None, None], - 'lots_of_null': [None, None, None, None, 5], - 'some_null': [None, 0, 3, 4, 5]})) + assert X.equals( + pd.DataFrame( + { + "all_null": [None, None, None, None, None], + "lots_of_null": [None, None, None, None, 5], + "some_null": [None, 0, 3, 4, 5], + } + ) + ) def test_drop_null_transformer_fit_transform(): drop_null_transformer = DropNullColumns() - X = pd.DataFrame({'lots_of_null': [None, None, None, None, 5], - 'no_null': [1, 2, 3, 4, 5]}) - X_expected = X.astype({'lots_of_null': 'float64', 'no_null': 'int64'}) + X = pd.DataFrame( + {"lots_of_null": [None, None, None, None, 5], "no_null": [1, 2, 3, 4, 5]} + ) + X_expected = X.astype({"lots_of_null": "float64", "no_null": "int64"}) X_t = drop_null_transformer.fit_transform(X) assert_frame_equal(X_expected, X_t) - X = pd.DataFrame({'lots_of_null': [None, None, None, None, 5], - 'all_null': [None, None, None, None, None], - 'no_null': [1, 2, 3, 4, 5]}) + X = pd.DataFrame( + { + "lots_of_null": [None, None, None, None, 5], + "all_null": [None, None, None, None, None], + "no_null": [1, 2, 3, 4, 5], + } + ) drop_null_transformer = DropNullColumns(pct_null_threshold=0.5) X_expected = X.drop(["lots_of_null", "all_null"], axis=1) - X_expected = X_expected.astype({'no_null': 'int64'}) + X_expected = X_expected.astype({"no_null": "int64"}) X_t = drop_null_transformer.fit_transform(X) assert_frame_equal(X_expected, X_t) # check that X is untouched - assert X.equals(pd.DataFrame({'lots_of_null': [None, None, None, None, 5], - 'all_null': [None, None, None, None, None], - 'no_null': [1, 2, 3, 4, 5]})) + assert X.equals( + pd.DataFrame( + { + "lots_of_null": [None, None, None, None, 5], + "all_null": [None, None, None, None, None], + "no_null": [1, 2, 3, 4, 5], + } + ) + ) drop_null_transformer = DropNullColumns(pct_null_threshold=0.0) - X = pd.DataFrame({'lots_of_null': [None, None, None, None, 5], - 'some_null': [None, 0, 3, 4, 5]}) + X = pd.DataFrame( + {"lots_of_null": [None, None, None, None, 5], "some_null": [None, 0, 3, 4, 5]} + ) X_t = drop_null_transformer.fit_transform(X) assert X_t.empty - X = pd.DataFrame({'all_null': [None, None, None, None, None], - 'lots_of_null': [None, None, None, None, 5], - 'some_null': [None, 0, 3, 4, 5]}) + X = 
pd.DataFrame( + { + "all_null": [None, None, None, None, None], + "lots_of_null": [None, None, None, None, 5], + "some_null": [None, 0, 3, 4, 5], + } + ) drop_null_transformer = DropNullColumns(pct_null_threshold=1.0) X_t = drop_null_transformer.fit_transform(X) assert_frame_equal(X.drop(["all_null"], axis=1), X_t) @@ -117,30 +160,53 @@ def test_drop_null_transformer_fit_transform(): def test_drop_null_transformer_np_array(): drop_null_transformer = DropNullColumns(pct_null_threshold=0.5) - X = np.array([[np.nan, 0, 2, 0], - [np.nan, 1, np.nan, 0], - [np.nan, 2, np.nan, 0], - [np.nan, 1, 1, 0]]) + X = np.array( + [ + [np.nan, 0, 2, 0], + [np.nan, 1, np.nan, 0], + [np.nan, 2, np.nan, 0], + [np.nan, 1, 1, 0], + ] + ) X_t = drop_null_transformer.fit_transform(X) assert_frame_equal(X_t, pd.DataFrame(np.delete(X, [0, 2], axis=1), columns=[1, 3])) # check that X is untouched - np.testing.assert_allclose(X, np.array([[np.nan, 0, 2, 0], - [np.nan, 1, np.nan, 0], - [np.nan, 2, np.nan, 0], - [np.nan, 1, 1, 0]])) - - -@pytest.mark.parametrize("X_df", [pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), - pd.DataFrame(pd.Series([1., 2., 3.], dtype="float")), - pd.DataFrame(pd.Series(['a', 'b', 'a'], dtype="category")), - pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), - pd.DataFrame(pd.Series(['this will be a natural language column because length', 'yay', 'hay'], dtype="string"))]) + np.testing.assert_allclose( + X, + np.array( + [ + [np.nan, 0, 2, 0], + [np.nan, 1, np.nan, 0], + [np.nan, 2, np.nan, 0], + [np.nan, 1, 1, 0], + ] + ), + ) + + +@pytest.mark.parametrize( + "X_df", + [ + pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), + pd.DataFrame(pd.Series([1.0, 2.0, 3.0], dtype="float")), + pd.DataFrame(pd.Series(["a", "b", "a"], dtype="category")), + pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), + pd.DataFrame( + pd.Series( + ["this will be a natural language column because length", "yay", "hay"], + dtype="string", + ) + ), + ], +) @pytest.mark.parametrize("has_nan", [True, False]) -def test_drop_null_transformer_woodwork_custom_overrides_returned_by_components(X_df, has_nan): +def test_drop_null_transformer_woodwork_custom_overrides_returned_by_components( + X_df, has_nan +): y = pd.Series([1, 2, 1]) if has_nan: - X_df['all null'] = [np.nan, np.nan, np.nan] + X_df["all null"] = [np.nan, np.nan, np.nan] override_types = [Integer, Double, Categorical, NaturalLanguage, Boolean] for logical_type in override_types: try: diff --git a/evalml/tests/component_tests/test_en_classifier.py b/evalml/tests/component_tests/test_en_classifier.py index 79288445c6..01f65731af 100644 --- a/evalml/tests/component_tests/test_en_classifier.py +++ b/evalml/tests/component_tests/test_en_classifier.py @@ -3,9 +3,7 @@ from sklearn.linear_model import SGDClassifier as SKElasticNetClassifier from evalml.model_family import ModelFamily -from evalml.pipelines.components.estimators.classifiers import ( - ElasticNetClassifier -) +from evalml.pipelines.components.estimators.classifiers import ElasticNetClassifier from evalml.problem_types import ProblemTypes @@ -14,20 +12,25 @@ def test_model_family(): def test_problem_types(): - assert set(ElasticNetClassifier.supported_problem_types) == {ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, - ProblemTypes.TIME_SERIES_MULTICLASS} + assert set(ElasticNetClassifier.supported_problem_types) == { + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + } 
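The hunks in this file exercise evalml's ElasticNetClassifier by fitting it alongside scikit-learn's SGDClassifier configured with an elastic-net penalty and checking that predictions and feature importances agree. Below is a minimal standalone sketch of that parity check, written only against the imports and calls visible in this diff; it assumes evalml and a 2021-era scikit-learn are installed, and it uses make_classification purely as a stand-in for the X_y_binary fixture defined elsewhere in the test suite.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier as SKElasticNetClassifier

from evalml.pipelines.components.estimators.classifiers import ElasticNetClassifier

# Stand-in data for the X_y_binary fixture used by the real tests.
X, y = make_classification(n_samples=100, n_features=20, random_state=0)

# Reference model: scikit-learn's SGDClassifier with an elastic-net penalty,
# mirroring the parameters used in test_feature_importance below.
sk_clf = SKElasticNetClassifier(
    loss="log",  # 2021-era spelling; newer scikit-learn releases call this "log_loss"
    penalty="elasticnet",
    alpha=0.0001,
    l1_ratio=0.15,
    n_jobs=1,
    random_state=0,
)
sk_clf.fit(X, y)

# Component under test; n_jobs=1 keeps the comparison single-threaded and deterministic.
clf = ElasticNetClassifier(n_jobs=1)
clf.fit(X, y)
y_pred = clf.predict(X)
assert len(y_pred) == len(y)

# The wrapper's feature importances should track the reference coefficients.
np.testing.assert_almost_equal(
    sk_clf.coef_.flatten(), clf.feature_importance, decimal=5
)
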
def test_fit_predict_binary(X_y_binary): X, y = X_y_binary - sk_clf = SKElasticNetClassifier(loss="log", - penalty="elasticnet", - alpha=0.0001, - l1_ratio=0.15, - n_jobs=-1, - random_state=0) + sk_clf = SKElasticNetClassifier( + loss="log", + penalty="elasticnet", + alpha=0.0001, + l1_ratio=0.15, + n_jobs=-1, + random_state=0, + ) sk_clf.fit(X, y) y_pred_sk = sk_clf.predict(X) y_pred_proba_sk = sk_clf.predict_proba(X) @@ -44,12 +47,14 @@ def test_fit_predict_binary(X_y_binary): def test_fit_predict_multi(X_y_multi): X, y = X_y_multi - sk_clf = SKElasticNetClassifier(loss="log", - penalty="elasticnet", - alpha=0.0001, - l1_ratio=0.15, - n_jobs=-1, - random_state=0) + sk_clf = SKElasticNetClassifier( + loss="log", + penalty="elasticnet", + alpha=0.0001, + l1_ratio=0.15, + n_jobs=-1, + random_state=0, + ) sk_clf.fit(X, y) y_pred_sk = sk_clf.predict(X) y_pred_proba_sk = sk_clf.predict_proba(X) @@ -68,29 +73,35 @@ def test_fit_predict_multi(X_y_multi): def test_feature_importance(X_y_binary): X, y = X_y_binary - sk_clf = SKElasticNetClassifier(loss="log", - penalty="elasticnet", - alpha=0.0001, - l1_ratio=0.15, - n_jobs=1, - random_state=0) + sk_clf = SKElasticNetClassifier( + loss="log", + penalty="elasticnet", + alpha=0.0001, + l1_ratio=0.15, + n_jobs=1, + random_state=0, + ) sk_clf.fit(X, y) clf = ElasticNetClassifier(n_jobs=1) clf.fit(X, y) - np.testing.assert_almost_equal(sk_clf.coef_.flatten(), clf.feature_importance, decimal=5) + np.testing.assert_almost_equal( + sk_clf.coef_.flatten(), clf.feature_importance, decimal=5 + ) def test_feature_importance_multi(X_y_multi): X, y = X_y_multi - sk_clf = SKElasticNetClassifier(loss="log", - penalty="elasticnet", - alpha=0.0001, - l1_ratio=0.15, - n_jobs=1, - random_state=0) + sk_clf = SKElasticNetClassifier( + loss="log", + penalty="elasticnet", + alpha=0.0001, + l1_ratio=0.15, + n_jobs=1, + random_state=0, + ) sk_clf.fit(X, y) clf = ElasticNetClassifier(n_jobs=1) @@ -108,7 +119,9 @@ def test_overwrite_loss_parameter_in_kwargs(): assert len(warnings) == 1 # check that the message matches - assert warnings[0].message.args[0] == ("Parameter loss is being set to 'log' so that ElasticNetClassifier can predict probabilities" - ". Originally received 'hinge'.") + assert warnings[0].message.args[0] == ( + "Parameter loss is being set to 'log' so that ElasticNetClassifier can predict probabilities" + ". Originally received 'hinge'." 
+ ) - assert en.parameters['loss'] == 'log' + assert en.parameters["loss"] == "log" diff --git a/evalml/tests/component_tests/test_en_regressor.py b/evalml/tests/component_tests/test_en_regressor.py index c6fbd82824..f7eb028016 100644 --- a/evalml/tests/component_tests/test_en_regressor.py +++ b/evalml/tests/component_tests/test_en_regressor.py @@ -2,9 +2,7 @@ from sklearn.linear_model import ElasticNet as SKElasticNetRegressor from evalml.model_family import ModelFamily -from evalml.pipelines.components.estimators.regressors import ( - ElasticNetRegressor -) +from evalml.pipelines.components.estimators.regressors import ElasticNetRegressor from evalml.problem_types import ProblemTypes @@ -17,25 +15,25 @@ def test_en_parameters(): expected_parameters = { "alpha": 0.75, "l1_ratio": 0.5, - 'max_iter': 1000, - 'normalize': False + "max_iter": 1000, + "normalize": False, } assert clf.parameters == expected_parameters def test_problem_types(): - assert set(ElasticNetRegressor.supported_problem_types) == {ProblemTypes.REGRESSION, - ProblemTypes.TIME_SERIES_REGRESSION} + assert set(ElasticNetRegressor.supported_problem_types) == { + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + } def test_fit_predict(X_y_regression): X, y = X_y_regression - sk_clf = SKElasticNetRegressor(alpha=0.0001, - l1_ratio=0.15, - random_state=0, - normalize=False, - max_iter=1000) + sk_clf = SKElasticNetRegressor( + alpha=0.0001, l1_ratio=0.15, random_state=0, normalize=False, max_iter=1000 + ) sk_clf.fit(X, y) y_pred_sk = sk_clf.predict(X) @@ -50,11 +48,9 @@ def test_fit_predict(X_y_regression): def test_feature_importance(X_y_regression): X, y = X_y_regression - sk_clf = SKElasticNetRegressor(alpha=0.0001, - l1_ratio=0.15, - random_state=0, - normalize=False, - max_iter=1000) + sk_clf = SKElasticNetRegressor( + alpha=0.0001, l1_ratio=0.15, random_state=0, normalize=False, max_iter=1000 + ) sk_clf.fit(X, y) clf = ElasticNetRegressor() diff --git a/evalml/tests/component_tests/test_estimators.py b/evalml/tests/component_tests/test_estimators.py index a34b16f71e..169e2bb74c 100644 --- a/evalml/tests/component_tests/test_estimators.py +++ b/evalml/tests/component_tests/test_estimators.py @@ -10,23 +10,27 @@ from evalml.pipelines.components import Estimator from evalml.pipelines.components.utils import ( _all_estimators_used_in_search, - get_estimators + get_estimators, ) from evalml.problem_types import ( ProblemTypes, handle_problem_types, is_binary, is_multiclass, - is_regression + is_regression, ) from evalml.utils import get_random_state -def test_estimators_feature_name_with_random_ascii(X_y_binary, X_y_multi, X_y_regression, ts_data, helper_functions): +def test_estimators_feature_name_with_random_ascii( + X_y_binary, X_y_multi, X_y_regression, ts_data, helper_functions +): for estimator_class in _all_estimators_used_in_search(): - if estimator_class.__name__ == 'ARIMARegressor': + if estimator_class.__name__ == "ARIMARegressor": continue - supported_problem_types = [handle_problem_types(pt) for pt in estimator_class.supported_problem_types] + supported_problem_types = [ + handle_problem_types(pt) for pt in estimator_class.supported_problem_types + ] for problem_type in supported_problem_types: clf = helper_functions.safe_init_component_with_njobs_1(estimator_class) if is_binary(problem_type): @@ -36,8 +40,12 @@ def test_estimators_feature_name_with_random_ascii(X_y_binary, X_y_multi, X_y_re elif is_regression(problem_type): X, y = X_y_regression - X = 
get_random_state(clf.random_seed).random((X.shape[0], len(string.printable))) - col_names = ['column_{}'.format(ascii_char) for ascii_char in string.printable] + X = get_random_state(clf.random_seed).random( + (X.shape[0], len(string.printable)) + ) + col_names = [ + "column_{}".format(ascii_char) for ascii_char in string.printable + ] X = pd.DataFrame(X, columns=col_names) assert clf.input_feature_names is None clf.fit(X, y) @@ -46,20 +54,28 @@ def test_estimators_feature_name_with_random_ascii(X_y_binary, X_y_multi, X_y_re predictions = clf.predict(X) assert len(predictions) == len(y) assert not np.isnan(predictions).all() - assert (clf.input_feature_names == col_names) + assert clf.input_feature_names == col_names def test_binary_classification_estimators_predict_proba_col_order(helper_functions): - X = pd.DataFrame({'input': np.concatenate([np.array([-1] * 100), np.array([1] * 100)])}) + X = pd.DataFrame( + {"input": np.concatenate([np.array([-1] * 100), np.array([1] * 100)])} + ) data = np.concatenate([np.zeros(100), np.ones(100)]) y = pd.Series(data) for estimator_class in _all_estimators_used_in_search(): - supported_problem_types = [handle_problem_types(pt) for pt in estimator_class.supported_problem_types] + supported_problem_types = [ + handle_problem_types(pt) for pt in estimator_class.supported_problem_types + ] if ProblemTypes.BINARY in supported_problem_types: - estimator = helper_functions.safe_init_component_with_njobs_1(estimator_class) + estimator = helper_functions.safe_init_component_with_njobs_1( + estimator_class + ) estimator.fit(X, y) predicted_proba = estimator.predict_proba(X) - expected = np.concatenate([(1 - data).reshape(-1, 1), data.reshape(-1, 1)], axis=1) + expected = np.concatenate( + [(1 - data).reshape(-1, 1), data.reshape(-1, 1)], axis=1 + ) np.testing.assert_allclose(expected, np.round(predicted_proba).values) @@ -67,20 +83,22 @@ def test_estimator_equality_different_supported_problem_types(): class MockEstimator(Estimator): name = "Mock Estimator" model_family = ModelFamily.LINEAR_MODEL - supported_problem_types = ['binary'] + supported_problem_types = ["binary"] mock_estimator = MockEstimator() - mock_estimator.supported_problem_types = ['binary', 'multiclass'] + mock_estimator.supported_problem_types = ["binary", "multiclass"] assert mock_estimator != MockEstimator() - assert 'Mock Estimator' != mock_estimator + assert "Mock Estimator" != mock_estimator -@pytest.mark.parametrize("data_type", ['li', 'np', 'pd', 'ww']) -def test_all_estimators_check_fit_input_type(data_type, X_y_binary, make_data_type, helper_functions): +@pytest.mark.parametrize("data_type", ["li", "np", "pd", "ww"]) +def test_all_estimators_check_fit_input_type( + data_type, X_y_binary, make_data_type, helper_functions +): X, y = X_y_binary X = make_data_type(data_type, X) y = make_data_type(data_type, y) - estimators_to_check = [estimator for estimator in get_estimators('binary')] + estimators_to_check = [estimator for estimator in get_estimators("binary")] for component_class in estimators_to_check: component = helper_functions.safe_init_component_with_njobs_1(component_class) component.fit(X, y) @@ -88,12 +106,14 @@ def test_all_estimators_check_fit_input_type(data_type, X_y_binary, make_data_ty component.predict_proba(X) -@pytest.mark.parametrize("data_type", ['li', 'np', 'pd', 'ww']) -def test_all_estimators_check_fit_input_type_regression(data_type, X_y_regression, make_data_type, helper_functions): +@pytest.mark.parametrize("data_type", ["li", "np", "pd", "ww"]) +def 
test_all_estimators_check_fit_input_type_regression( + data_type, X_y_regression, make_data_type, helper_functions +): X, y = X_y_regression X = make_data_type(data_type, X) y = make_data_type(data_type, y) - estimators_to_check = [estimator for estimator in get_estimators('regression')] + estimators_to_check = [estimator for estimator in get_estimators("regression")] for component_class in estimators_to_check: component = helper_functions.safe_init_component_with_njobs_1(component_class) component.fit(X, y) @@ -108,50 +128,97 @@ def test_estimator_predict_output_type(X_y_binary, ts_data, helper_functions): y_list = list(y_np) X_df_no_col_names = pd.DataFrame(X_np) range_index = pd.RangeIndex(start=0, stop=X_np.shape[1], step=1) - X_df_with_col_names = pd.DataFrame(X_np, columns=['x' + str(i) for i in range(X_np.shape[1])]) + X_df_with_col_names = pd.DataFrame( + X_np, columns=["x" + str(i) for i in range(X_np.shape[1])] + ) y_series_no_name = pd.Series(y_np) - y_series_with_name = pd.Series(y_np, name='target') - X_df_no_col_names_ts = pd.DataFrame(data=X_df_no_col_names.values, columns=X_df_no_col_names.columns, - index=pd.date_range(start='1/1/2018', periods=X_df_no_col_names.shape[0])) - X_df_with_col_names_ts = pd.DataFrame(data=X_df_with_col_names.values, - columns=['x' + str(i) for i in range(X_np.shape[1])], - index=pd.date_range(start='1/1/2018', periods=X_df_with_col_names.shape[0])) - - datatype_combos = [(X_np, y_np, range_index, np.unique(y_np), False), - (X_np, y_list, range_index, np.unique(y_np), False), - (X_df_no_col_names, y_series_no_name, range_index, y_series_no_name.unique(), False), - (X_df_with_col_names, y_series_with_name, X_df_with_col_names.columns, y_series_with_name.unique(), False), - (X_df_no_col_names_ts, y_series_no_name, range_index, y_series_no_name.unique(), True), - (X_df_with_col_names_ts, y_series_with_name, X_df_with_col_names_ts.columns, y_series_with_name.unique(), True)] + y_series_with_name = pd.Series(y_np, name="target") + X_df_no_col_names_ts = pd.DataFrame( + data=X_df_no_col_names.values, + columns=X_df_no_col_names.columns, + index=pd.date_range(start="1/1/2018", periods=X_df_no_col_names.shape[0]), + ) + X_df_with_col_names_ts = pd.DataFrame( + data=X_df_with_col_names.values, + columns=["x" + str(i) for i in range(X_np.shape[1])], + index=pd.date_range(start="1/1/2018", periods=X_df_with_col_names.shape[0]), + ) + + datatype_combos = [ + (X_np, y_np, range_index, np.unique(y_np), False), + (X_np, y_list, range_index, np.unique(y_np), False), + ( + X_df_no_col_names, + y_series_no_name, + range_index, + y_series_no_name.unique(), + False, + ), + ( + X_df_with_col_names, + y_series_with_name, + X_df_with_col_names.columns, + y_series_with_name.unique(), + False, + ), + ( + X_df_no_col_names_ts, + y_series_no_name, + range_index, + y_series_no_name.unique(), + True, + ), + ( + X_df_with_col_names_ts, + y_series_with_name, + X_df_with_col_names_ts.columns, + y_series_with_name.unique(), + True, + ), + ] for component_class in _all_estimators_used_in_search(): for X, y, X_cols_expected, y_cols_expected, time_series in datatype_combos: - if component_class.name == 'ARIMA Regressor' and not time_series: + if component_class.name == "ARIMA Regressor" and not time_series: continue - elif component_class.name != 'ARIMA Regressor' and time_series: + elif component_class.name != "ARIMA Regressor" and time_series: continue - print('Checking output of predict for estimator "{}" on X type {} cols {}, y type {} name {}' - .format(component_class.name, 
type(X), - X.columns if isinstance(X, pd.DataFrame) else None, type(y), - y.name if isinstance(y, pd.Series) else None)) - component = helper_functions.safe_init_component_with_njobs_1(component_class) + print( + 'Checking output of predict for estimator "{}" on X type {} cols {}, y type {} name {}'.format( + component_class.name, + type(X), + X.columns if isinstance(X, pd.DataFrame) else None, + type(y), + y.name if isinstance(y, pd.Series) else None, + ) + ) + component = helper_functions.safe_init_component_with_njobs_1( + component_class + ) component.fit(X, y=y) predict_output = component.predict(X) assert isinstance(predict_output, pd.Series) assert len(predict_output) == len(y) - if component_class.name == 'ARIMA Regressor': - assert predict_output.name == 'predicted_mean' + if component_class.name == "ARIMA Regressor": + assert predict_output.name == "predicted_mean" else: assert predict_output.name is None - if not ((ProblemTypes.BINARY in component_class.supported_problem_types) or - (ProblemTypes.MULTICLASS in component_class.supported_problem_types)): + if not ( + (ProblemTypes.BINARY in component_class.supported_problem_types) + or (ProblemTypes.MULTICLASS in component_class.supported_problem_types) + ): continue - print('Checking output of predict_proba for estimator "{}" on X type {} cols {}, y type {} name {}' - .format(component_class.name, type(X), - X.columns if isinstance(X, pd.DataFrame) else None, type(y), - y.name if isinstance(y, pd.Series) else None)) + print( + 'Checking output of predict_proba for estimator "{}" on X type {} cols {}, y type {} name {}'.format( + component_class.name, + type(X), + X.columns if isinstance(X, pd.DataFrame) else None, + type(y), + y.name if isinstance(y, pd.Series) else None, + ) + ) predict_proba_output = component.predict_proba(X) assert isinstance(predict_proba_output, pd.DataFrame) assert predict_proba_output.shape == (len(y), len(np.unique(y))) @@ -162,7 +229,7 @@ def test_estimator_check_for_fit_with_overrides(X_y_binary): class MockEstimatorWithOverrides(Estimator): name = "Mock Estimator" model_family = ModelFamily.LINEAR_MODEL - supported_problem_types = ['binary'] + supported_problem_types = ["binary"] def fit(self, X, y): pass @@ -176,7 +243,7 @@ def predict_proba(self, X): class MockEstimatorWithOverridesSubclass(Estimator): name = "Mock Estimator Subclass" model_family = ModelFamily.LINEAR_MODEL - supported_problem_types = ['binary'] + supported_problem_types = ["binary"] def fit(self, X, y): pass @@ -191,9 +258,9 @@ def predict_proba(self, X): est = MockEstimatorWithOverrides() est_subclass = MockEstimatorWithOverridesSubclass() - with pytest.raises(ComponentNotYetFittedError, match='You must fit'): + with pytest.raises(ComponentNotYetFittedError, match="You must fit"): est.predict(X) - with pytest.raises(ComponentNotYetFittedError, match='You must fit'): + with pytest.raises(ComponentNotYetFittedError, match="You must fit"): est_subclass.predict(X) est.fit(X, y) @@ -215,7 +282,7 @@ def test_estimator_manage_woodwork(X_y_binary): class MockEstimator(Estimator): name = "Mock Estimator Subclass" model_family = ModelFamily.LINEAR_MODEL - supported_problem_types = ['binary'] + supported_problem_types = ["binary"] # Test y is None case est = MockEstimator() diff --git a/evalml/tests/component_tests/test_et_classifier.py b/evalml/tests/component_tests/test_et_classifier.py index 7d0e9e1fed..d3d9d2bbfd 100644 --- a/evalml/tests/component_tests/test_et_classifier.py +++ b/evalml/tests/component_tests/test_et_classifier.py @@ 
-11,9 +11,12 @@ def test_model_family(): def test_problem_types(): - assert set(ExtraTreesClassifier.supported_problem_types) == {ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, - ProblemTypes.TIME_SERIES_MULTICLASS} + assert set(ExtraTreesClassifier.supported_problem_types) == { + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + } def test_fit_predict_binary(X_y_binary): diff --git a/evalml/tests/component_tests/test_et_regressor.py b/evalml/tests/component_tests/test_et_regressor.py index ab86df4fef..d092c557cb 100644 --- a/evalml/tests/component_tests/test_et_regressor.py +++ b/evalml/tests/component_tests/test_et_regressor.py @@ -11,8 +11,10 @@ def test_model_family(): def test_problem_types(): - assert set(ExtraTreesRegressor.supported_problem_types) == {ProblemTypes.REGRESSION, - ProblemTypes.TIME_SERIES_REGRESSION} + assert set(ExtraTreesRegressor.supported_problem_types) == { + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + } def test_fit_predict(X_y_regression): diff --git a/evalml/tests/component_tests/test_feature_selectors.py b/evalml/tests/component_tests/test_feature_selectors.py index dbf70ebaa4..e4f7170518 100644 --- a/evalml/tests/component_tests/test_feature_selectors.py +++ b/evalml/tests/component_tests/test_feature_selectors.py @@ -8,7 +8,7 @@ ComponentBase, FeatureSelector, RFClassifierSelectFromModel, - RFRegressorSelectFromModel + RFRegressorSelectFromModel, ) @@ -56,9 +56,15 @@ def fit(self, X, y): mock_feature_selector = MockFeatureSelector() mock_feature_selector.fit(pd.DataFrame(), pd.Series()) - with pytest.raises(MethodPropertyNotFoundError, match="Feature selector requires a transform method or a component_obj that implements transform"): + with pytest.raises( + MethodPropertyNotFoundError, + match="Feature selector requires a transform method or a component_obj that implements transform", + ): mock_feature_selector.transform(pd.DataFrame()) - with pytest.raises(MethodPropertyNotFoundError, match="Feature selector requires a transform method or a component_obj that implements transform"): + with pytest.raises( + MethodPropertyNotFoundError, + match="Feature selector requires a transform method or a component_obj that implements transform", + ): mock_feature_selector.fit_transform(pd.DataFrame()) @@ -74,22 +80,40 @@ def fit(self, X, y): mock_feature_selector = MockFeatureSelector() mock_feature_selector.fit(pd.DataFrame(), pd.Series()) - with pytest.raises(MethodPropertyNotFoundError, match="Feature selector requires a transform method or a component_obj that implements transform"): + with pytest.raises( + MethodPropertyNotFoundError, + match="Feature selector requires a transform method or a component_obj that implements transform", + ): mock_feature_selector.transform(pd.DataFrame()) - with pytest.raises(MethodPropertyNotFoundError, match="Feature selector requires a transform method or a component_obj that implements transform"): + with pytest.raises( + MethodPropertyNotFoundError, + match="Feature selector requires a transform method or a component_obj that implements transform", + ): mock_feature_selector.fit_transform(pd.DataFrame()) -@pytest.mark.parametrize("X_df", [pd.DataFrame(pd.to_datetime(['20190902', '20200519', '20190607'], format='%Y%m%d')), - pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), - pd.DataFrame(pd.Series([1., 2., 3.], dtype="float")), - pd.DataFrame(pd.Series(['a', 'b', 'a'], dtype="category")), - 
pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), - pd.DataFrame(pd.Series(['this will be a natural language column because length', 'yay', 'hay'], dtype="string"))]) +@pytest.mark.parametrize( + "X_df", + [ + pd.DataFrame( + pd.to_datetime(["20190902", "20200519", "20190607"], format="%Y%m%d") + ), + pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), + pd.DataFrame(pd.Series([1.0, 2.0, 3.0], dtype="float")), + pd.DataFrame(pd.Series(["a", "b", "a"], dtype="category")), + pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), + pd.DataFrame( + pd.Series( + ["this will be a natural language column because length", "yay", "hay"], + dtype="string", + ) + ), + ], +) def test_feature_selectors_woodwork_custom_overrides_returned_by_components(X_df): rf_classifier, rf_regressor = make_rf_feature_selectors() y = pd.Series([1, 2, 1]) - X_df['another column'] = pd.Series([1., 2., 3.], dtype="float") + X_df["another column"] = pd.Series([1.0, 2.0, 3.0], dtype="float") override_types = [Integer, Double, Boolean] for logical_type in override_types: try: @@ -101,9 +125,15 @@ def test_feature_selectors_woodwork_custom_overrides_returned_by_components(X_df rf_classifier.fit(X, y) transformed = rf_classifier.transform(X, y) assert isinstance(transformed, pd.DataFrame) - assert transformed.ww.logical_types == {0: logical_type, 'another column': Double} + assert transformed.ww.logical_types == { + 0: logical_type, + "another column": Double, + } rf_regressor.fit(X, y) transformed = rf_regressor.transform(X, y) assert isinstance(transformed, pd.DataFrame) - assert transformed.ww.logical_types == {0: logical_type, 'another column': Double} + assert transformed.ww.logical_types == { + 0: logical_type, + "another column": Double, + } diff --git a/evalml/tests/component_tests/test_featuretools.py b/evalml/tests/component_tests/test_featuretools.py index 7fbc20a8c2..3eff41ff3f 100644 --- a/evalml/tests/component_tests/test_featuretools.py +++ b/evalml/tests/component_tests/test_featuretools.py @@ -5,13 +5,7 @@ import pytest import woodwork as ww from pandas.testing import assert_frame_equal -from woodwork.logical_types import ( - Boolean, - Categorical, - Datetime, - Double, - Integer -) +from woodwork.logical_types import Boolean, Categorical, Datetime, Double, Integer from evalml.pipelines.components import DFSTransformer @@ -33,31 +27,37 @@ def test_numeric_columns(X_y_multi): feature.transform(X_pd) -@patch('evalml.pipelines.components.transformers.preprocessing.featuretools.dfs') -@patch('evalml.pipelines.components.transformers.preprocessing.featuretools.calculate_feature_matrix') +@patch("evalml.pipelines.components.transformers.preprocessing.featuretools.dfs") +@patch( + "evalml.pipelines.components.transformers.preprocessing.featuretools.calculate_feature_matrix" +) def test_featuretools_index(mock_calculate_feature_matrix, mock_dfs, X_y_multi): X, y = X_y_multi X_pd = pd.DataFrame(X) X_new_index = X_pd.copy() index = [i for i in range(len(X))] new_index = [i * 2 for i in index] - X_new_index['index'] = new_index + X_new_index["index"] = new_index mock_calculate_feature_matrix.return_value = pd.DataFrame({}) # check if _make_entity_set keeps the intended index feature = DFSTransformer() feature.fit(X_new_index) feature.transform(X_new_index) - arg_es = mock_dfs.call_args[1]['entityset'].entities[0].df['index'] - arg_tr = mock_calculate_feature_matrix.call_args[1]['entityset'].entities[0].df['index'] + arg_es = mock_dfs.call_args[1]["entityset"].entities[0].df["index"] + arg_tr = ( + 
mock_calculate_feature_matrix.call_args[1]["entityset"].entities[0].df["index"] + ) assert arg_es.to_list() == new_index assert arg_tr.to_list() == new_index # check if _make_entity_set fills in the proper index values feature.fit(X_pd) feature.transform(X_pd) - arg_es = mock_dfs.call_args[1]['entityset'].entities[0].df['index'] - arg_tr = mock_calculate_feature_matrix.call_args[1]['entityset'].entities[0].df['index'] + arg_es = mock_dfs.call_args[1]["entityset"].entities[0].df["index"] + arg_tr = ( + mock_calculate_feature_matrix.call_args[1]["entityset"].entities[0].df["index"] + ) assert arg_es.to_list() == index assert arg_tr.to_list() == index @@ -69,7 +69,9 @@ def test_transform(X_y_binary, X_y_multi, X_y_regression): X_pd = pd.DataFrame(X) X_pd.columns = X_pd.columns.astype(str) es = ft.EntitySet() - es = es.entity_from_dataframe(entity_id="X", dataframe=X_pd, index='index', make_index=True) + es = es.entity_from_dataframe( + entity_id="X", dataframe=X_pd, index="index", make_index=True + ) feature_matrix, features = ft.dfs(entityset=es, target_entity="X") feature = DFSTransformer() @@ -94,10 +96,12 @@ def test_transform_subset(X_y_binary, X_y_multi, X_y_regression): X_pd = pd.DataFrame(X) X_pd.columns = X_pd.columns.astype(str) X_fit = X_pd.iloc[: len(X) // 3] - X_transform = X_pd.iloc[len(X) // 3:] + X_transform = X_pd.iloc[len(X) // 3 :] es = ft.EntitySet() - es = es.entity_from_dataframe(entity_id="X", dataframe=X_transform, index='index', make_index=True) + es = es.entity_from_dataframe( + entity_id="X", dataframe=X_transform, index="index", make_index=True + ) feature_matrix, features = ft.dfs(entityset=es, target_entity="X") feature = DFSTransformer() @@ -107,10 +111,17 @@ def test_transform_subset(X_y_binary, X_y_multi, X_y_regression): assert_frame_equal(feature_matrix, X_t) -@pytest.mark.parametrize("X_df", [pd.DataFrame(pd.to_datetime(['20190902', '20200519', '20190607'], format='%Y%m%d')), - pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), - pd.DataFrame(pd.Series([1., 2., 3.], dtype="float")), - pd.DataFrame(pd.Series(['a', 'b', 'a'], dtype="category"))]) +@pytest.mark.parametrize( + "X_df", + [ + pd.DataFrame( + pd.to_datetime(["20190902", "20200519", "20190607"], format="%Y%m%d") + ), + pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), + pd.DataFrame(pd.Series([1.0, 2.0, 3.0], dtype="float")), + pd.DataFrame(pd.Series(["a", "b", "a"], dtype="category")), + ], +) def test_ft_woodwork_custom_overrides_returned_by_components(X_df): y = pd.Series([1, 2, 1]) override_types = [Integer, Double, Categorical, Datetime, Boolean] @@ -126,12 +137,17 @@ def test_ft_woodwork_custom_overrides_returned_by_components(X_df): transformed = dft.transform(X, y) assert isinstance(transformed, pd.DataFrame) if logical_type == Datetime: - assert transformed.ww.logical_types == {'DAY(0)': Integer, 'MONTH(0)': Integer, 'WEEKDAY(0)': Integer, 'YEAR(0)': Integer} + assert transformed.ww.logical_types == { + "DAY(0)": Integer, + "MONTH(0)": Integer, + "WEEKDAY(0)": Integer, + "YEAR(0)": Integer, + } else: - assert transformed.ww.logical_types == {'0': logical_type} + assert transformed.ww.logical_types == {"0": logical_type} -@patch('evalml.pipelines.components.transformers.preprocessing.featuretools.dfs') +@patch("evalml.pipelines.components.transformers.preprocessing.featuretools.dfs") def test_dfs_sets_max_depth_1(mock_dfs, X_y_multi): X, y = X_y_multi X_pd = pd.DataFrame(X) @@ -139,4 +155,4 @@ def test_dfs_sets_max_depth_1(mock_dfs, X_y_multi): feature = DFSTransformer() feature.fit(X_pd, 
y) _, kwargs = mock_dfs.call_args - assert kwargs['max_depth'] == 1 + assert kwargs["max_depth"] == 1 diff --git a/evalml/tests/component_tests/test_imputer.py b/evalml/tests/component_tests/test_imputer.py index b24750be86..c29df95e63 100644 --- a/evalml/tests/component_tests/test_imputer.py +++ b/evalml/tests/component_tests/test_imputer.py @@ -8,7 +8,7 @@ Categorical, Double, Integer, - NaturalLanguage + NaturalLanguage, ) from evalml.pipelines.components import Imputer @@ -16,20 +16,30 @@ @pytest.fixture def imputer_test_data(): - return pd.DataFrame({ - "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'), - "int col": [0, 1, 2, 0, 3], - "object col": ["b", "b", "a", "c", "d"], - "float col": [0.0, 1.0, 0.0, -2.0, 5.], - "bool col": [True, False, False, True, True], - "categorical with nan": pd.Series([np.nan, "1", np.nan, "0", "3"], dtype='category'), - "int with nan": [np.nan, 1, 0, 0, 1], - "float with nan": [0.0, 1.0, np.nan, -1.0, 0.], - "object with nan": ["b", "b", np.nan, "c", np.nan], - "bool col with nan": pd.Series([True, np.nan, False, np.nan, True], dtype='category'), - "all nan": [np.nan, np.nan, np.nan, np.nan, np.nan], - "all nan cat": pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan], dtype='category') - }) + return pd.DataFrame( + { + "categorical col": pd.Series( + ["zero", "one", "two", "zero", "three"], dtype="category" + ), + "int col": [0, 1, 2, 0, 3], + "object col": ["b", "b", "a", "c", "d"], + "float col": [0.0, 1.0, 0.0, -2.0, 5.0], + "bool col": [True, False, False, True, True], + "categorical with nan": pd.Series( + [np.nan, "1", np.nan, "0", "3"], dtype="category" + ), + "int with nan": [np.nan, 1, 0, 0, 1], + "float with nan": [0.0, 1.0, np.nan, -1.0, 0.0], + "object with nan": ["b", "b", np.nan, "c", np.nan], + "bool col with nan": pd.Series( + [True, np.nan, False, np.nan, True], dtype="category" + ), + "all nan": [np.nan, np.nan, np.nan, np.nan, np.nan], + "all nan cat": pd.Series( + [np.nan, np.nan, np.nan, np.nan, np.nan], dtype="category" + ), + } + ) def test_invalid_strategy_parameters(): @@ -42,31 +52,35 @@ def test_invalid_strategy_parameters(): def test_imputer_default_parameters(): imputer = Imputer() expected_parameters = { - 'categorical_impute_strategy': 'most_frequent', - 'numeric_impute_strategy': 'mean', - 'categorical_fill_value': None, - 'numeric_fill_value': None + "categorical_impute_strategy": "most_frequent", + "numeric_impute_strategy": "mean", + "categorical_fill_value": None, + "numeric_fill_value": None, } assert imputer.parameters == expected_parameters @pytest.mark.parametrize("categorical_impute_strategy", ["most_frequent", "constant"]) -@pytest.mark.parametrize("numeric_impute_strategy", ["mean", "median", "most_frequent", "constant"]) +@pytest.mark.parametrize( + "numeric_impute_strategy", ["mean", "median", "most_frequent", "constant"] +) def test_imputer_init(categorical_impute_strategy, numeric_impute_strategy): - imputer = Imputer(categorical_impute_strategy=categorical_impute_strategy, - numeric_impute_strategy=numeric_impute_strategy, - categorical_fill_value="str_fill_value", - numeric_fill_value=-1) + imputer = Imputer( + categorical_impute_strategy=categorical_impute_strategy, + numeric_impute_strategy=numeric_impute_strategy, + categorical_fill_value="str_fill_value", + numeric_fill_value=-1, + ) expected_parameters = { - 'categorical_impute_strategy': categorical_impute_strategy, - 'numeric_impute_strategy': numeric_impute_strategy, - 'categorical_fill_value': 
'str_fill_value', - 'numeric_fill_value': -1 + "categorical_impute_strategy": categorical_impute_strategy, + "numeric_impute_strategy": numeric_impute_strategy, + "categorical_fill_value": "str_fill_value", + "numeric_fill_value": -1, } expected_hyperparameters = { "categorical_impute_strategy": ["most_frequent"], - "numeric_impute_strategy": ["mean", "median", "most_frequent"] + "numeric_impute_strategy": ["mean", "median", "most_frequent"], } assert imputer.name == "Imputer" assert imputer.parameters == expected_parameters @@ -74,18 +88,21 @@ def test_imputer_init(categorical_impute_strategy, numeric_impute_strategy): def test_numeric_only_input(imputer_test_data): - X = imputer_test_data[["int col", "float col", - "int with nan", "float with nan", "all nan"]] + X = imputer_test_data[ + ["int col", "float col", "int with nan", "float with nan", "all nan"] + ] y = pd.Series([0, 0, 1, 0, 1]) imputer = Imputer(numeric_impute_strategy="median") imputer.fit(X, y) transformed = imputer.transform(X, y) - expected = pd.DataFrame({ - "int col": [0, 1, 2, 0, 3], - "float col": [0.0, 1.0, 0.0, -2.0, 5.], - "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0], - "float with nan": [0.0, 1.0, 0, -1.0, 0.] - }) + expected = pd.DataFrame( + { + "int col": [0, 1, 2, 0, 3], + "float col": [0.0, 1.0, 0.0, -2.0, 5.0], + "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0], + "float with nan": [0.0, 1.0, 0, -1.0, 0.0], + } + ) assert_frame_equal(transformed, expected, check_dtype=False) imputer = Imputer() @@ -94,21 +111,37 @@ def test_numeric_only_input(imputer_test_data): def test_categorical_only_input(imputer_test_data): - X = imputer_test_data[["categorical col", "object col", "bool col", - "categorical with nan", "object with nan", - "bool col with nan", "all nan cat"]] + X = imputer_test_data[ + [ + "categorical col", + "object col", + "bool col", + "categorical with nan", + "object with nan", + "bool col with nan", + "all nan cat", + ] + ] y = pd.Series([0, 0, 1, 0, 1]) imputer = Imputer() imputer.fit(X, y) transformed = imputer.transform(X, y) - expected = pd.DataFrame({ - "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'), - "object col": pd.Series(["b", "b", "a", "c", "d"], dtype='category'), - "bool col": [True, False, False, True, True], - "categorical with nan": pd.Series(["0", "1", "0", "0", "3"], dtype='category'), - "object with nan": pd.Series(["b", "b", "b", "c", "b"], dtype='category'), - "bool col with nan": pd.Series([True, True, False, True, True], dtype='category') - }) + expected = pd.DataFrame( + { + "categorical col": pd.Series( + ["zero", "one", "two", "zero", "three"], dtype="category" + ), + "object col": pd.Series(["b", "b", "a", "c", "d"], dtype="category"), + "bool col": [True, False, False, True, True], + "categorical with nan": pd.Series( + ["0", "1", "0", "0", "3"], dtype="category" + ), + "object with nan": pd.Series(["b", "b", "b", "c", "b"], dtype="category"), + "bool col with nan": pd.Series( + [True, True, False, True, True], dtype="category" + ), + } + ) imputer = Imputer() transformed = imputer.fit_transform(X, y) @@ -121,18 +154,26 @@ def test_categorical_and_numeric_input(imputer_test_data): imputer = Imputer() imputer.fit(X, y) transformed = imputer.transform(X, y) - expected = pd.DataFrame({ - "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'), - "int col": [0, 1, 2, 0, 3], - "object col": pd.Series(["b", "b", "a", "c", "d"], dtype='category'), - "float col": [0.0, 1.0, 0.0, -2.0, 5.], - "bool col": [True, 
False, False, True, True], - "categorical with nan": pd.Series(["0", "1", "0", "0", "3"], dtype='category'), - "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0], - "float with nan": [0.0, 1.0, 0, -1.0, 0.], - "object with nan": pd.Series(["b", "b", "b", "c", "b"], dtype='category'), - "bool col with nan": pd.Series([True, True, False, True, True], dtype="category") - }) + expected = pd.DataFrame( + { + "categorical col": pd.Series( + ["zero", "one", "two", "zero", "three"], dtype="category" + ), + "int col": [0, 1, 2, 0, 3], + "object col": pd.Series(["b", "b", "a", "c", "d"], dtype="category"), + "float col": [0.0, 1.0, 0.0, -2.0, 5.0], + "bool col": [True, False, False, True, True], + "categorical with nan": pd.Series( + ["0", "1", "0", "0", "3"], dtype="category" + ), + "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0], + "float with nan": [0.0, 1.0, 0, -1.0, 0.0], + "object with nan": pd.Series(["b", "b", "b", "c", "b"], dtype="category"), + "bool col with nan": pd.Series( + [True, True, False, True, True], dtype="category" + ), + } + ) assert_frame_equal(transformed, expected, check_dtype=False) imputer = Imputer() @@ -156,16 +197,12 @@ def test_drop_all_columns(imputer_test_data): def test_typed_imputer_numpy_input(): - X = np.array([[1, 2, 2, 0], - [np.nan, 0, 0, 0], - [1, np.nan, np.nan, np.nan]]) + X = np.array([[1, 2, 2, 0], [np.nan, 0, 0, 0], [1, np.nan, np.nan, np.nan]]) y = pd.Series([0, 0, 1]) imputer = Imputer() imputer.fit(X, y) transformed = imputer.transform(X, y) - expected = pd.DataFrame(np.array([[1, 2, 2, 0], - [1, 0, 0, 0], - [1, 1, 1, 0]])) + expected = pd.DataFrame(np.array([[1, 2, 2, 0], [1, 0, 0, 0], [1, 1, 1, 0]])) assert_frame_equal(transformed, expected, check_dtype=False) imputer = Imputer() @@ -174,10 +211,14 @@ def test_typed_imputer_numpy_input(): def test_imputer_datetime_input(): - X = pd.DataFrame({'dates': ['20190902', '20200519', '20190607', np.nan], - 'more dates': ['20190902', '20201010', '20190921', np.nan]}) - X['dates'] = pd.to_datetime(X['dates'], format='%Y%m%d') - X['more dates'] = pd.to_datetime(X['more dates'], format='%Y%m%d') + X = pd.DataFrame( + { + "dates": ["20190902", "20200519", "20190607", np.nan], + "more dates": ["20190902", "20201010", "20190921", np.nan], + } + ) + X["dates"] = pd.to_datetime(X["dates"], format="%Y%m%d") + X["more dates"] = pd.to_datetime(X["more dates"], format="%Y%m%d") y = pd.Series() imputer = Imputer() @@ -190,7 +231,7 @@ def test_imputer_datetime_input(): assert_frame_equal(transformed, X, check_dtype=False) -@pytest.mark.parametrize("data_type", ['np', 'pd', 'ww']) +@pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) def test_imputer_empty_data(data_type, make_data_type): X = pd.DataFrame() y = pd.Series() @@ -208,43 +249,77 @@ def test_imputer_empty_data(data_type, make_data_type): def test_imputer_does_not_reset_index(): - X = pd.DataFrame({'input_val': np.arange(10), 'target': np.arange(10), - 'input_cat': ['a'] * 7 + ['b'] * 3}) - X.loc[5, 'input_val'] = np.nan - X.loc[5, 'input_cat'] = np.nan + X = pd.DataFrame( + { + "input_val": np.arange(10), + "target": np.arange(10), + "input_cat": ["a"] * 7 + ["b"] * 3, + } + ) + X.loc[5, "input_val"] = np.nan + X.loc[5, "input_cat"] = np.nan assert X.index.tolist() == list(range(10)) X.drop(0, inplace=True) - y = X.pop('target') + y = X.pop("target") imputer = Imputer() imputer.fit(X, y=y) transformed = imputer.transform(X) - pd.testing.assert_frame_equal(transformed, - pd.DataFrame({'input_val': [1.0, 2, 3, 4, 5, 6, 7, 8, 9], - 'input_cat': pd.Categorical(['a'] * 6 + 
['b'] * 3)}, - index=list(range(1, 10)))) + pd.testing.assert_frame_equal( + transformed, + pd.DataFrame( + { + "input_val": [1.0, 2, 3, 4, 5, 6, 7, 8, 9], + "input_cat": pd.Categorical(["a"] * 6 + ["b"] * 3), + }, + index=list(range(1, 10)), + ), + ) def test_imputer_fill_value(imputer_test_data): - X = imputer_test_data[["int with nan", "categorical with nan", - "float with nan", "object with nan", "bool col with nan"]] + X = imputer_test_data[ + [ + "int with nan", + "categorical with nan", + "float with nan", + "object with nan", + "bool col with nan", + ] + ] y = pd.Series([0, 0, 1, 0, 1]) - imputer = Imputer(categorical_impute_strategy="constant", numeric_impute_strategy="constant", - categorical_fill_value="fill", numeric_fill_value=-1) + imputer = Imputer( + categorical_impute_strategy="constant", + numeric_impute_strategy="constant", + categorical_fill_value="fill", + numeric_fill_value=-1, + ) imputer.fit(X, y) transformed = imputer.transform(X, y) - expected = pd.DataFrame({ - "int with nan": [-1, 1, 0, 0, 1], - "categorical with nan": pd.Series(["fill", "1", "fill", "0", "3"], dtype='category'), - "float with nan": [0.0, 1.0, -1, -1.0, 0.], - "object with nan": pd.Series(["b", "b", "fill", "c", "fill"], dtype='category'), - "bool col with nan": pd.Series([True, "fill", False, "fill", True], dtype='category') - }) + expected = pd.DataFrame( + { + "int with nan": [-1, 1, 0, 0, 1], + "categorical with nan": pd.Series( + ["fill", "1", "fill", "0", "3"], dtype="category" + ), + "float with nan": [0.0, 1.0, -1, -1.0, 0.0], + "object with nan": pd.Series( + ["b", "b", "fill", "c", "fill"], dtype="category" + ), + "bool col with nan": pd.Series( + [True, "fill", False, "fill", True], dtype="category" + ), + } + ) assert_frame_equal(expected, transformed, check_dtype=False) - imputer = Imputer(categorical_impute_strategy="constant", numeric_impute_strategy="constant", - categorical_fill_value="fill", numeric_fill_value=-1) + imputer = Imputer( + categorical_impute_strategy="constant", + numeric_impute_strategy="constant", + categorical_fill_value="fill", + numeric_fill_value=-1, + ) transformed = imputer.fit_transform(X, y) assert_frame_equal(expected, transformed, check_dtype=False) @@ -252,39 +327,59 @@ def test_imputer_fill_value(imputer_test_data): def test_imputer_no_nans(imputer_test_data): X = imputer_test_data[["categorical col", "object col", "bool col"]] y = pd.Series([0, 0, 1, 0, 1]) - imputer = Imputer(categorical_impute_strategy="constant", numeric_impute_strategy="constant", - categorical_fill_value="fill", numeric_fill_value=-1) + imputer = Imputer( + categorical_impute_strategy="constant", + numeric_impute_strategy="constant", + categorical_fill_value="fill", + numeric_fill_value=-1, + ) imputer.fit(X, y) transformed = imputer.transform(X, y) - expected = pd.DataFrame({ - "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'), - "object col": pd.Series(["b", "b", "a", "c", "d"], dtype='category'), - "bool col": [True, False, False, True, True], - }) + expected = pd.DataFrame( + { + "categorical col": pd.Series( + ["zero", "one", "two", "zero", "three"], dtype="category" + ), + "object col": pd.Series(["b", "b", "a", "c", "d"], dtype="category"), + "bool col": [True, False, False, True, True], + } + ) assert_frame_equal(transformed, expected, check_dtype=False) - imputer = Imputer(categorical_impute_strategy="constant", numeric_impute_strategy="constant", - categorical_fill_value="fill", numeric_fill_value=-1) + imputer = Imputer( + 
categorical_impute_strategy="constant", + numeric_impute_strategy="constant", + categorical_fill_value="fill", + numeric_fill_value=-1, + ) transformed = imputer.fit_transform(X, y) assert_frame_equal(transformed, expected, check_dtype=False) def test_imputer_with_none(): - X = pd.DataFrame({"int with None": [1, 0, 5, None], - "float with None": [0.1, 0.0, 0.5, None], - "category with None": pd.Series(["b", "a", "a", None], dtype='category'), - "boolean with None": pd.Series([True, None, False, True]), - "object with None": ["b", "a", "a", None], - "all None": [None, None, None, None]}) + X = pd.DataFrame( + { + "int with None": [1, 0, 5, None], + "float with None": [0.1, 0.0, 0.5, None], + "category with None": pd.Series(["b", "a", "a", None], dtype="category"), + "boolean with None": pd.Series([True, None, False, True]), + "object with None": ["b", "a", "a", None], + "all None": [None, None, None, None], + } + ) y = pd.Series([0, 0, 1, 0, 1]) imputer = Imputer() imputer.fit(X, y) transformed = imputer.transform(X, y) - expected = pd.DataFrame({"int with None": [1, 0, 5, 2], - "float with None": [0.1, 0.0, 0.5, 0.2], - "category with None": pd.Series(["b", "a", "a", "a"], dtype='category'), - "boolean with None": pd.Series([True, True, False, True], dtype='category'), - "object with None": pd.Series(["b", "a", "a", "a"], dtype='category')}) + expected = pd.DataFrame( + { + "int with None": [1, 0, 5, 2], + "float with None": [0.1, 0.0, 0.5, 0.2], + "category with None": pd.Series(["b", "a", "a", "a"], dtype="category"), + "boolean with None": pd.Series([True, True, False, True], dtype="category"), + "object with None": pd.Series(["b", "a", "a", "a"], dtype="category"), + } + ) assert_frame_equal(expected, transformed, check_dtype=False) imputer = Imputer() @@ -292,9 +387,11 @@ def test_imputer_with_none(): assert_frame_equal(expected, transformed, check_dtype=False) -@pytest.mark.parametrize("data_type", ['pd', 'ww']) +@pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_imputer_all_bool_return_original(data_type, make_data_type): - X = make_data_type(data_type, pd.DataFrame([True, True, False, True, True], dtype=bool)) + X = make_data_type( + data_type, pd.DataFrame([True, True, False, True, True], dtype=bool) + ) X_expected_arr = pd.DataFrame([True, True, False, True, True], dtype=bool) y = make_data_type(data_type, pd.Series([1, 0, 0, 1, 0])) @@ -304,11 +401,11 @@ def test_imputer_all_bool_return_original(data_type, make_data_type): assert_frame_equal(X_expected_arr, X_t) -@pytest.mark.parametrize("data_type", ['pd', 'ww']) +@pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_imputer_bool_dtype_object(data_type, make_data_type): X = pd.DataFrame([True, np.nan, False, np.nan, True]) y = pd.Series([1, 0, 0, 1, 0]) - X_expected_arr = pd.DataFrame([True, True, False, True, True], dtype='category') + X_expected_arr = pd.DataFrame([True, True, False, True, True], dtype="category") X = make_data_type(data_type, X) y = make_data_type(data_type, y) imputer = Imputer() @@ -317,17 +414,23 @@ def test_imputer_bool_dtype_object(data_type, make_data_type): assert_frame_equal(X_expected_arr, X_t) -@pytest.mark.parametrize("data_type", ['pd', 'ww']) +@pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_imputer_multitype_with_one_bool(data_type, make_data_type): - X_multi = pd.DataFrame({ - "bool with nan": pd.Series([True, np.nan, False, np.nan, False]), - "bool no nan": pd.Series([False, False, False, False, True], dtype=bool), - }) + X_multi = pd.DataFrame( + { + "bool with 
nan": pd.Series([True, np.nan, False, np.nan, False]), + "bool no nan": pd.Series([False, False, False, False, True], dtype=bool), + } + ) y = pd.Series([1, 0, 0, 1, 0]) - X_multi_expected_arr = pd.DataFrame({ - "bool with nan": pd.Series([True, False, False, False, False], dtype='category'), - "bool no nan": pd.Series([False, False, False, False, True], dtype=bool), - }) + X_multi_expected_arr = pd.DataFrame( + { + "bool with nan": pd.Series( + [True, False, False, False, False], dtype="category" + ), + "bool no nan": pd.Series([False, False, False, False, True], dtype=bool), + } + ) X_multi = make_data_type(data_type, X_multi) y = make_data_type(data_type, y) @@ -342,19 +445,25 @@ def test_imputer_int_preserved(): X = pd.DataFrame(pd.Series([1, 2, 11, np.nan])) imputer = Imputer(numeric_impute_strategy="mean") transformed = imputer.fit_transform(X) - pd.testing.assert_frame_equal(transformed, pd.DataFrame(pd.Series([1, 2, 11, 14 / 3]))) + pd.testing.assert_frame_equal( + transformed, pd.DataFrame(pd.Series([1, 2, 11, 14 / 3])) + ) assert transformed.ww.logical_types == {0: Double} X = pd.DataFrame(pd.Series([1, 2, 3, np.nan])) imputer = Imputer(numeric_impute_strategy="mean") transformed = imputer.fit_transform(X) - pd.testing.assert_frame_equal(transformed, pd.DataFrame(pd.Series([1, 2, 3, 2])), check_dtype=False) + pd.testing.assert_frame_equal( + transformed, pd.DataFrame(pd.Series([1, 2, 3, 2])), check_dtype=False + ) assert transformed.ww.logical_types == {0: Double} - X = pd.DataFrame(pd.Series([1, 2, 3, 4], dtype='int')) + X = pd.DataFrame(pd.Series([1, 2, 3, 4], dtype="int")) imputer = Imputer(numeric_impute_strategy="mean") transformed = imputer.fit_transform(X) - pd.testing.assert_frame_equal(transformed, pd.DataFrame(pd.Series([1, 2, 3, 4])), check_dtype=False) + pd.testing.assert_frame_equal( + transformed, pd.DataFrame(pd.Series([1, 2, 3, 4])), check_dtype=False + ) assert transformed.ww.logical_types == {0: Integer} @@ -362,24 +471,43 @@ def test_imputer_bool_preserved(): X = pd.DataFrame(pd.Series([True, False, True, np.nan])) imputer = Imputer(categorical_impute_strategy="most_frequent") transformed = imputer.fit_transform(X) - pd.testing.assert_frame_equal(transformed, pd.DataFrame(pd.Series([True, False, True, True], dtype="category"))) + pd.testing.assert_frame_equal( + transformed, + pd.DataFrame(pd.Series([True, False, True, True], dtype="category")), + ) assert transformed.ww.logical_types == {0: Categorical} X = pd.DataFrame(pd.Series([True, False, True, False])) imputer = Imputer(categorical_impute_strategy="most_frequent") transformed = imputer.fit_transform(X) - pd.testing.assert_frame_equal(transformed, pd.DataFrame(pd.Series([True, False, True, False])), check_dtype=False) + pd.testing.assert_frame_equal( + transformed, + pd.DataFrame(pd.Series([True, False, True, False])), + check_dtype=False, + ) assert transformed.ww.logical_types == {0: Boolean} -@pytest.mark.parametrize("X_df", [pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), - pd.DataFrame(pd.Series([1., 2., 4.], dtype="float")), - pd.DataFrame(pd.Series(['a', 'b', 'a'], dtype="category")), - pd.DataFrame(pd.Series([True, False, True], dtype=bool)), - pd.DataFrame(pd.Series(['this will be a natural language column because length', 'yay', 'hay'], dtype="string"))]) +@pytest.mark.parametrize( + "X_df", + [ + pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), + pd.DataFrame(pd.Series([1.0, 2.0, 4.0], dtype="float")), + pd.DataFrame(pd.Series(["a", "b", "a"], dtype="category")), + 
pd.DataFrame(pd.Series([True, False, True], dtype=bool)), + pd.DataFrame( + pd.Series( + ["this will be a natural language column because length", "yay", "hay"], + dtype="string", + ) + ), + ], +) @pytest.mark.parametrize("has_nan", [True, False]) @pytest.mark.parametrize("numeric_impute_strategy", ["mean", "median", "most_frequent"]) -def test_imputer_woodwork_custom_overrides_returned_by_components(X_df, has_nan, numeric_impute_strategy): +def test_imputer_woodwork_custom_overrides_returned_by_components( + X_df, has_nan, numeric_impute_strategy +): y = pd.Series([1, 2, 1]) override_types = [Integer, Double, Categorical, NaturalLanguage, Boolean] for logical_type in override_types: diff --git a/evalml/tests/component_tests/test_knn_classifier.py b/evalml/tests/component_tests/test_knn_classifier.py index 16509cbe00..b148fa68f5 100644 --- a/evalml/tests/component_tests/test_knn_classifier.py +++ b/evalml/tests/component_tests/test_knn_classifier.py @@ -11,9 +11,12 @@ def test_model_family(): def test_problem_types(): - assert set(KNeighborsClassifier.supported_problem_types) == {ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, - ProblemTypes.TIME_SERIES_MULTICLASS} + assert set(KNeighborsClassifier.supported_problem_types) == { + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + } def test_fit_predict_binary(X_y_binary): diff --git a/evalml/tests/component_tests/test_lda.py b/evalml/tests/component_tests/test_lda.py index 3d02cca64f..542ec38372 100644 --- a/evalml/tests/component_tests/test_lda.py +++ b/evalml/tests/component_tests/test_lda.py @@ -9,75 +9,88 @@ def test_lda_invalid_init(): - with pytest.raises(ValueError, match="Invalid number of compponents for Linear Discriminant Analysis"): + with pytest.raises( + ValueError, + match="Invalid number of compponents for Linear Discriminant Analysis", + ): LinearDiscriminantAnalysis(n_components=-1) -@pytest.mark.parametrize('data_type', ['pd', 'ww']) +@pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_lda_numeric(data_type, make_data_type): - X = pd.DataFrame([[3, 0, 1, 6], - [1, 2, 1, 6], - [10, 2, 1, 6], - [10, 2, 2, 5], - [6, 2, 2, 5]]) + X = pd.DataFrame( + [[3, 0, 1, 6], [1, 2, 1, 6], [10, 2, 1, 6], [10, 2, 2, 5], [6, 2, 2, 5]] + ) y = pd.Series([0, 1, 0, 1, 1]) X = make_data_type(data_type, X) y = make_data_type(data_type, y) lda = LinearDiscriminantAnalysis() - expected_X_t = pd.DataFrame([[-3.7498560857993817], - [1.984459921694517], - [-3.234411950294312], - [1.3401547523131798], - [3.659653362085993]], - columns=["component_0"]) + expected_X_t = pd.DataFrame( + [ + [-3.7498560857993817], + [1.984459921694517], + [-3.234411950294312], + [1.3401547523131798], + [3.659653362085993], + ], + columns=["component_0"], + ) X_t = lda.fit_transform(X, y) assert_frame_equal(expected_X_t, X_t) def test_lda_array(): - X = np.array([[3, 0, 1, 6], - [1, 2, 1, 6], - [10, 2, 1, 6], - [10, 2, 2, 5], - [6, 2, 2, 5]]) + X = np.array( + [[3, 0, 1, 6], [1, 2, 1, 6], [10, 2, 1, 6], [10, 2, 2, 5], [6, 2, 2, 5]] + ) y = np.array([2, 2, 0, 1, 0]) lda = LinearDiscriminantAnalysis() - expected_X_t = pd.DataFrame([[-0.6412164311777084, 0.5197032695565076], - [0.9499648898073094, -0.6919658287324498], - [0.7364892645407753, 0.884637532109161], - [-0.570057889422197, -0.005831184057363141], - [-0.4751798337481819, -0.7065437888758568]], - columns=[f"component_{i}" for i in range(2)]) + expected_X_t = pd.DataFrame( + [ + [-0.6412164311777084, 
0.5197032695565076], + [0.9499648898073094, -0.6919658287324498], + [0.7364892645407753, 0.884637532109161], + [-0.570057889422197, -0.005831184057363141], + [-0.4751798337481819, -0.7065437888758568], + ], + columns=[f"component_{i}" for i in range(2)], + ) lda.fit(X, y) X_t = lda.transform(X) assert_frame_equal(expected_X_t, X_t) def test_lda_invalid(): - X = pd.DataFrame([[3, 0, 1, 6], - [1, None, 1, 6], - [10, 2, 1, 6], - [10, 2, 2, np.nan], - [None, 2, 2, 5]]) + X = pd.DataFrame( + [ + [3, 0, 1, 6], + [1, None, 1, 6], + [10, 2, 1, 6], + [10, 2, 2, np.nan], + [None, 2, 2, 5], + ] + ) y = [2, 0, 1, 1, 0] lda = LinearDiscriminantAnalysis() with pytest.raises(ValueError, match="must be all numeric"): lda.fit(X, y) - X = pd.DataFrame([[3, 0, 1, 6], - ['a', 'b', 'a', 'b'], - [10, 2, 1, 6], - [10, 2, 2, 23], - [0, 2, 2, 5]]) + X = pd.DataFrame( + [ + [3, 0, 1, 6], + ["a", "b", "a", "b"], + [10, 2, 1, 6], + [10, 2, 2, 23], + [0, 2, 2, 5], + ] + ) lda = LinearDiscriminantAnalysis() with pytest.raises(ValueError, match="must be all numeric"): lda.fit_transform(X, y) - X_ok = pd.DataFrame([[3, 0, 1, 6], - [1, 2, 1, 6], - [10, 2, 1, 6], - [10, 2, 2, 5], - [6, 2, 2, 5]]) + X_ok = pd.DataFrame( + [[3, 0, 1, 6], [1, 2, 1, 6], [10, 2, 1, 6], [10, 2, 2, 5], [6, 2, 2, 5]] + ) lda = LinearDiscriminantAnalysis() lda.fit(X_ok, y) with pytest.raises(ValueError, match="must be all numeric"): @@ -85,13 +98,17 @@ def test_lda_invalid(): def test_n_components(): - X = pd.DataFrame([[3, 0, 1, 6, 5, 10], - [1, 3, 1, 3, 11, 4], - [10, 2, 3, 12, 5, 6], - [10, 6, 4, 3, 0, 1], - [6, 8, 9, 3, 3, 5], - [3, 2, 1, 2, 1, 3], - [12, 11, 1, 1, 3, 3]]) + X = pd.DataFrame( + [ + [3, 0, 1, 6, 5, 10], + [1, 3, 1, 3, 11, 4], + [10, 2, 3, 12, 5, 6], + [10, 6, 4, 3, 0, 1], + [6, 8, 9, 3, 3, 5], + [3, 2, 1, 2, 1, 3], + [12, 11, 1, 1, 3, 3], + ] + ) y = [0, 3, 3, 1, 2, 0, 2] lda = LinearDiscriminantAnalysis(n_components=3) @@ -104,25 +121,33 @@ def test_n_components(): def test_invalid_n_components(): - X = pd.DataFrame([[3, 0, 1, 6, 5, 10], - [1, 3, 1, 3, 11, 4], - [10, 2, 3, 12, 5, 6], - [10, 6, 4, 3, 0, 1], - [6, 8, 9, 3, 3, 5], - [3, 2, 1, 2, 1, 3], - [12, 11, 1, 1, 3, 3]]) + X = pd.DataFrame( + [ + [3, 0, 1, 6, 5, 10], + [1, 3, 1, 3, 11, 4], + [10, 2, 3, 12, 5, 6], + [10, 6, 4, 3, 0, 1], + [6, 8, 9, 3, 3, 5], + [3, 2, 1, 2, 1, 3], + [12, 11, 1, 1, 3, 3], + ] + ) y = [0, 1, 2, 1, 2, 0, 2] lda_invalid = LinearDiscriminantAnalysis(n_components=4) with pytest.raises(ValueError, match="is too large"): lda_invalid.fit(X, y) - X = pd.DataFrame([[3, 0, 1], - [1, 3, 1], - [10, 2, 3], - [10, 6, 4], - [6, 8, 9], - [3, 2, 1], - [12, 11, 1]]) + X = pd.DataFrame( + [ + [3, 0, 1], + [1, 3, 1], + [10, 2, 3], + [10, 6, 4], + [6, 8, 9], + [3, 2, 1], + [12, 11, 1], + ] + ) y = [0, 1, 2, 3, 4, 3, 4, 5] lda_invalid = LinearDiscriminantAnalysis(n_components=4) with pytest.raises(ValueError, match="is too large"): @@ -130,17 +155,22 @@ def test_invalid_n_components(): def test_lda_woodwork_custom_overrides_returned_by_components(): - X_df = pd.DataFrame([[3, 0, 1, 6], - [1, 2, 1, 6], - [10, 2, 1, 6], - [10, 2, 2, 5], - [6, 2, 2, 5]]) + X_df = pd.DataFrame( + [[3, 0, 1, 6], [1, 2, 1, 6], [10, 2, 1, 6], [10, 2, 2, 5], [6, 2, 2, 5]] + ) y = pd.Series([0, 1, 0, 1, 1]) override_types = [Integer, Double] for logical_type in override_types: - X_df.ww.init(logical_types={0: logical_type, 1: logical_type, 2: logical_type, 3: logical_type}) + X_df.ww.init( + logical_types={ + 0: logical_type, + 1: logical_type, + 2: logical_type, + 3: logical_type, + } + ) 
lda = LinearDiscriminantAnalysis(n_components=1) lda.fit(X_df, y) transformed = lda.transform(X_df, y) assert isinstance(transformed, pd.DataFrame) - assert transformed.ww.logical_types == {'component_0': ww.logical_types.Double} + assert transformed.ww.logical_types == {"component_0": ww.logical_types.Double} diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py index befff337b5..1766024f0a 100644 --- a/evalml/tests/component_tests/test_lgbm_classifier.py +++ b/evalml/tests/component_tests/test_lgbm_classifier.py @@ -11,7 +11,7 @@ from evalml.problem_types import ProblemTypes from evalml.utils import SEED_BOUNDS -lgbm = importorskip('lightgbm', reason='Skipping test because lightgbm not installed') +lgbm = importorskip("lightgbm", reason="Skipping test because lightgbm not installed") def test_model_family(): @@ -19,9 +19,12 @@ def test_model_family(): def test_problem_types(): - assert set(LightGBMClassifier.supported_problem_types) == {ProblemTypes.MULTICLASS, ProblemTypes.BINARY, - ProblemTypes.TIME_SERIES_MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY} + assert set(LightGBMClassifier.supported_problem_types) == { + ProblemTypes.MULTICLASS, + ProblemTypes.BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + } def test_lightgbm_classifier_random_seed_bounds_seed(X_y_binary): @@ -30,10 +33,14 @@ def test_lightgbm_classifier_random_seed_bounds_seed(X_y_binary): col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) y = pd.Series(y) - clf = LightGBMClassifier(n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.min_bound, n_jobs=1) + clf = LightGBMClassifier( + n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.min_bound, n_jobs=1 + ) fitted = clf.fit(X, y) assert isinstance(fitted, LightGBMClassifier) - clf = LightGBMClassifier(n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.max_bound, n_jobs=1) + clf = LightGBMClassifier( + n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.max_bound, n_jobs=1 + ) clf.fit(X, y) @@ -88,14 +95,14 @@ def test_feature_importance(X_y_binary): def test_fit_string_features(X_y_binary): X, y = X_y_binary X = pd.DataFrame(X) - X['string_col'] = 'abc' + X["string_col"] = "abc" # lightGBM requires input args to be int, float, or bool, not string X_expected = X.copy() - X_expected['string_col'] = 0.0 + X_expected["string_col"] = 0.0 clf = lgbm.sklearn.LGBMClassifier(random_state=0, n_jobs=1) - clf.fit(X_expected, y, categorical_feature=['string_col']) + clf.fit(X_expected, y, categorical_feature=["string_col"]) y_pred_sk = clf.predict(X_expected) y_pred_proba_sk = clf.predict_proba(X_expected) @@ -108,8 +115,8 @@ def test_fit_string_features(X_y_binary): np.testing.assert_almost_equal(y_pred_proba_sk, y_pred_proba.values, decimal=5) -@patch('evalml.pipelines.components.estimators.estimator.Estimator.predict_proba') -@patch('evalml.pipelines.components.estimators.estimator.Estimator.predict') +@patch("evalml.pipelines.components.estimators.estimator.Estimator.predict_proba") +@patch("evalml.pipelines.components.estimators.estimator.Estimator.predict") def test_fit_no_categories(mock_predict, mock_predict_proba, X_y_binary): X, y = X_y_binary X2 = pd.DataFrame(X) @@ -126,24 +133,26 @@ def test_fit_no_categories(mock_predict, mock_predict_proba, X_y_binary): np.testing.assert_array_equal(arg_X, X2) -@patch('evalml.pipelines.components.estimators.estimator.Estimator.predict_proba') 
-@patch('evalml.pipelines.components.estimators.estimator.Estimator.predict') +@patch("evalml.pipelines.components.estimators.estimator.Estimator.predict_proba") +@patch("evalml.pipelines.components.estimators.estimator.Estimator.predict") def test_correct_args(mock_predict, mock_predict_proba, X_y_binary): X, y = X_y_binary X = pd.DataFrame(X) # add object (string) and categorical data. - X['string_col'] = 'abc' - X['string_col'].iloc[len(X) // 2:] = 'cba' - X['categorical_data'] = 'square' - X['categorical_data'].iloc[len(X) // 2:] = 'circle' - X['categorical_data'] = X['categorical_data'].astype('category') + X["string_col"] = "abc" + X["string_col"].iloc[len(X) // 2 :] = "cba" + X["categorical_data"] = "square" + X["categorical_data"].iloc[len(X) // 2 :] = "circle" + X["categorical_data"] = X["categorical_data"].astype("category") # create the expected result, which is a dataframe with int values in the categorical column and dtype=category X_expected = X.copy() X_expected = X_expected.replace(["abc", "cba"], [0.0, 1.0]) X_expected = X_expected.replace(["square", "circle"], [1.0, 0.0]) - X_expected[['string_col', 'categorical_data']] = X_expected[['string_col', 'categorical_data']].astype('category') + X_expected[["string_col", "categorical_data"]] = X_expected[ + ["string_col", "categorical_data"] + ].astype("category") # rename the columns to be the indices X_expected.columns = np.arange(X_expected.shape[1]) @@ -160,17 +169,21 @@ def test_correct_args(mock_predict, mock_predict_proba, X_y_binary): assert_frame_equal(X_expected, arg_X) -@patch('evalml.pipelines.components.estimators.estimator.Estimator.predict_proba') -@patch('evalml.pipelines.components.estimators.estimator.Estimator.predict') +@patch("evalml.pipelines.components.estimators.estimator.Estimator.predict_proba") +@patch("evalml.pipelines.components.estimators.estimator.Estimator.predict") def test_categorical_data_subset(mock_predict, mock_predict_proba, X_y_binary): - X = pd.DataFrame({"feature_1": [0, 0, 1, 1, 0, 1], "feature_2": ["a", "a", "b", "b", "c", "c"]}) + X = pd.DataFrame( + {"feature_1": [0, 0, 1, 1, 0, 1], "feature_2": ["a", "a", "b", "b", "c", "c"]} + ) y = pd.Series([1, 1, 0, 0, 0, 1]) - X_expected = pd.DataFrame({0: [0, 0, 1, 1, 0, 1], 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]}) - X_expected.iloc[:, 1] = X_expected.iloc[:, 1].astype('category') + X_expected = pd.DataFrame( + {0: [0, 0, 1, 1, 0, 1], 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]} + ) + X_expected.iloc[:, 1] = X_expected.iloc[:, 1].astype("category") X_subset = pd.DataFrame({"feature_1": [1, 0], "feature_2": ["c", "a"]}) X_expected_subset = pd.DataFrame({0: [1, 0], 1: [2.0, 0.0]}) - X_expected_subset.iloc[:, 1] = X_expected_subset.iloc[:, 1].astype('category') + X_expected_subset.iloc[:, 1] = X_expected_subset.iloc[:, 1].astype("category") clf = LightGBMClassifier() clf.fit(X, y) @@ -185,13 +198,13 @@ def test_categorical_data_subset(mock_predict, mock_predict_proba, X_y_binary): assert_frame_equal(X_expected_subset, arg_X) -@patch('evalml.pipelines.components.estimators.estimator.Estimator.predict_proba') -@patch('evalml.pipelines.components.estimators.estimator.Estimator.predict') +@patch("evalml.pipelines.components.estimators.estimator.Estimator.predict_proba") +@patch("evalml.pipelines.components.estimators.estimator.Estimator.predict") def test_multiple_fit(mock_predict, mock_predict_proba): y = pd.Series([1] * 4) X1_fit = pd.DataFrame({"feature": ["a", "b", "c", "c"]}) X1_predict = pd.DataFrame({"feature": ["a", "a", "b", "c"]}) - X1_predict_expected = 
pd.DataFrame({0: [0.0, 0.0, 1.0, 2.0]}, dtype='category') + X1_predict_expected = pd.DataFrame({0: [0.0, 0.0, 1.0, 2.0]}, dtype="category") clf = LightGBMClassifier() clf.fit(X1_fit, y) @@ -203,7 +216,7 @@ def test_multiple_fit(mock_predict, mock_predict_proba): # Check if it will fit a different dataset with new variable X2_fit = pd.DataFrame({"feature": ["c", "b", "a", "d"]}) X2_predict = pd.DataFrame({"feature": ["d", "c", "b", "a"]}) - X2_predict_expected = pd.DataFrame({0: [3.0, 2.0, 1.0, 0.0]}, dtype='category') + X2_predict_expected = pd.DataFrame({0: [3.0, 2.0, 1.0, 0.0]}, dtype="category") clf = LightGBMClassifier() clf.fit(X2_fit, y) @@ -213,21 +226,23 @@ def test_multiple_fit(mock_predict, mock_predict_proba): assert_frame_equal(X2_predict_expected, mock_predict_proba.call_args[0][0]) -@patch('evalml.pipelines.components.estimators.estimator.Estimator.predict') +@patch("evalml.pipelines.components.estimators.estimator.Estimator.predict") def test_multiclass_label(mock_predict, X_y_multi): X, y = X_y_multi - y_numeric = pd.Series(y, dtype='int64') - y_alpha = pd.Series(y_numeric.copy().replace({0: "alright", 1: "better", 2: "great"})) + y_numeric = pd.Series(y, dtype="int64") + y_alpha = pd.Series( + y_numeric.copy().replace({0: "alright", 1: "better", 2: "great"}) + ) clf = LightGBMClassifier() clf.fit(X, y_alpha) clf.predict(X) -@patch('evalml.pipelines.components.estimators.estimator.Estimator.predict') +@patch("evalml.pipelines.components.estimators.estimator.Estimator.predict") def test_binary_label_encoding(mock_predict, X_y_binary): X, y = X_y_binary - y_numeric = pd.Series(y, dtype='int64') + y_numeric = pd.Series(y, dtype="int64") y_alpha = pd.Series(y_numeric.copy().replace({0: "no", 1: "yes"})) clf = LightGBMClassifier() clf.fit(X, y_alpha) @@ -242,30 +257,37 @@ def test_binary_rf(X_y_binary): X, y = X_y_binary with pytest.raises(lgbm.basic.LightGBMError, match="bagging_fraction"): - clf = LightGBMClassifier(boosting_type="rf", bagging_freq=1, bagging_fraction=1.01) + clf = LightGBMClassifier( + boosting_type="rf", bagging_freq=1, bagging_fraction=1.01 + ) clf.fit(X, y) clf = LightGBMClassifier(boosting_type="rf", bagging_freq=0, n_jobs=1) clf.fit(X, y) - assert clf.parameters['bagging_freq'] == 0 - assert clf.parameters['bagging_fraction'] == 0.9 + assert clf.parameters["bagging_freq"] == 0 + assert clf.parameters["bagging_fraction"] == 0.9 def test_binary_goss(X_y_binary): X, y = X_y_binary clf = LightGBMClassifier(boosting_type="goss") clf.fit(X, y) - assert clf.parameters['bagging_freq'] == 0 - assert clf.parameters['bagging_fraction'] == 0.9 + assert clf.parameters["bagging_freq"] == 0 + assert clf.parameters["bagging_fraction"] == 0.9 -@pytest.mark.parametrize("data_type", ['pd', 'ww']) +@pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_lightgbm_multiindex(data_type, X_y_binary, make_data_type): X, y = X_y_binary X = pd.DataFrame(X) - categorical_col = pd.Series([1] * int(len(X[0]) / 2) + [0] * int(len(X[0]) - len(X[0]) / 2), dtype='category') - X['cat'] = categorical_col - col_names = [('column_{}'.format(num), '{}'.format(num)) for num in range(len(X.columns))] + categorical_col = pd.Series( + [1] * int(len(X[0]) / 2) + [0] * int(len(X[0]) - len(X[0]) / 2), + dtype="category", + ) + X["cat"] = categorical_col + col_names = [ + ("column_{}".format(num), "{}".format(num)) for num in range(len(X.columns)) + ] X.columns = pd.MultiIndex.from_tuples(col_names) X = make_data_type(data_type, X) y = make_data_type(data_type, y) diff --git 
a/evalml/tests/component_tests/test_lgbm_regressor.py b/evalml/tests/component_tests/test_lgbm_regressor.py index 4d31a534a1..1b20b23bd4 100644 --- a/evalml/tests/component_tests/test_lgbm_regressor.py +++ b/evalml/tests/component_tests/test_lgbm_regressor.py @@ -11,7 +11,7 @@ from evalml.problem_types import ProblemTypes from evalml.utils import SEED_BOUNDS -lgbm = importorskip('lightgbm', reason='Skipping test because lightgbm not installed') +lgbm = importorskip("lightgbm", reason="Skipping test because lightgbm not installed") def test_model_family(): @@ -28,10 +28,14 @@ def test_lightgbm_regressor_random_seed_bounds_seed(X_y_regression): col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) y = pd.Series(y) - clf = LightGBMRegressor(n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.min_bound) + clf = LightGBMRegressor( + n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.min_bound + ) fitted = clf.fit(X, y) assert isinstance(fitted, LightGBMRegressor) - clf = LightGBMRegressor(n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.max_bound) + clf = LightGBMRegressor( + n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.max_bound + ) clf.fit(X, y) @@ -66,14 +70,14 @@ def test_feature_importance(X_y_regression): def test_fit_string_features(X_y_regression): X, y = X_y_regression X = pd.DataFrame(X) - X['string_col'] = 'abc' + X["string_col"] = "abc" # lightGBM requires input args to be int, float, or bool, not string X_expected = X.copy() - X_expected['string_col'] = 0.0 + X_expected["string_col"] = 0.0 clf = lgbm.sklearn.LGBMRegressor(n_estimators=20, random_state=0) - clf.fit(X_expected, y, categorical_feature=['string_col']) + clf.fit(X_expected, y, categorical_feature=["string_col"]) y_pred_sk = clf.predict(X_expected) clf = LightGBMRegressor() @@ -83,23 +87,25 @@ def test_fit_string_features(X_y_regression): np.testing.assert_almost_equal(y_pred_sk, y_pred.values, decimal=5) -@patch('evalml.pipelines.components.estimators.estimator.Estimator.predict') +@patch("evalml.pipelines.components.estimators.estimator.Estimator.predict") def test_correct_args(mock_predict, X_y_regression): X, y = X_y_regression X = pd.DataFrame(X) # add object (string) and categorical data. 
- X['string_col'] = 'abc' - X['string_col'].iloc[len(X) // 2:] = 'cba' - X['categorical_data'] = 'square' - X['categorical_data'].iloc[len(X) // 2:] = 'circle' - X['categorical_data'] = X['categorical_data'].astype('category') + X["string_col"] = "abc" + X["string_col"].iloc[len(X) // 2 :] = "cba" + X["categorical_data"] = "square" + X["categorical_data"].iloc[len(X) // 2 :] = "circle" + X["categorical_data"] = X["categorical_data"].astype("category") # create the expected result, which is a dataframe with int values in the categorical column and dtype=category X_expected = X.copy() X_expected = X_expected.replace(["abc", "cba"], [0.0, 1.0]) X_expected = X_expected.replace(["square", "circle"], [1.0, 0.0]) - X_expected[['string_col', 'categorical_data']] = X_expected[['string_col', 'categorical_data']].astype('category') + X_expected[["string_col", "categorical_data"]] = X_expected[ + ["string_col", "categorical_data"] + ].astype("category") # rename the columns to be the indices X_expected.columns = np.arange(X_expected.shape[1]) @@ -112,16 +118,20 @@ def test_correct_args(mock_predict, X_y_regression): assert_frame_equal(X_expected, arg_X) -@patch('evalml.pipelines.components.estimators.estimator.Estimator.predict') +@patch("evalml.pipelines.components.estimators.estimator.Estimator.predict") def test_categorical_data_subset(mock_predict, X_y_regression): - X = pd.DataFrame({"feature_1": [0, 0, 1, 1, 0, 1], "feature_2": ["a", "a", "b", "b", "c", "c"]}) + X = pd.DataFrame( + {"feature_1": [0, 0, 1, 1, 0, 1], "feature_2": ["a", "a", "b", "b", "c", "c"]} + ) y = pd.Series([1, 1, 0, 0, 0, 1]) - X_expected = pd.DataFrame({0: [0, 0, 1, 1, 0, 1], 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]}) - X_expected.iloc[:, 1] = X_expected.iloc[:, 1].astype('category') + X_expected = pd.DataFrame( + {0: [0, 0, 1, 1, 0, 1], 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]} + ) + X_expected.iloc[:, 1] = X_expected.iloc[:, 1].astype("category") X_subset = pd.DataFrame({"feature_1": [1, 0], "feature_2": ["c", "a"]}) X_expected_subset = pd.DataFrame({0: [1, 0], 1: [2.0, 0.0]}) - X_expected_subset.iloc[:, 1] = X_expected_subset.iloc[:, 1].astype('category') + X_expected_subset.iloc[:, 1] = X_expected_subset.iloc[:, 1].astype("category") clf = LightGBMRegressor() clf.fit(X, y) @@ -132,12 +142,12 @@ def test_categorical_data_subset(mock_predict, X_y_regression): assert_frame_equal(X_expected_subset, arg_X) -@patch('evalml.pipelines.components.estimators.estimator.Estimator.predict') +@patch("evalml.pipelines.components.estimators.estimator.Estimator.predict") def test_multiple_fit(mock_predict): y = pd.Series([1] * 4) X1_fit = pd.DataFrame({"feature": ["a", "b", "c", "c"]}) X1_predict = pd.DataFrame({"feature": ["a", "a", "b", "c"]}) - X1_predict_expected = pd.DataFrame({0: [0.0, 0.0, 1.0, 2.0]}, dtype='category') + X1_predict_expected = pd.DataFrame({0: [0.0, 0.0, 1.0, 2.0]}, dtype="category") clf = LightGBMRegressor() clf.fit(X1_fit, y) @@ -147,7 +157,7 @@ def test_multiple_fit(mock_predict): # Check if it will fit a different dataset with new variable X2_fit = pd.DataFrame({"feature": ["c", "b", "a", "d"]}) X2_predict = pd.DataFrame({"feature": ["d", "c", "b", "a"]}) - X2_predict_expected = pd.DataFrame({0: [3.0, 2.0, 1.0, 0.0]}, dtype='category') + X2_predict_expected = pd.DataFrame({0: [3.0, 2.0, 1.0, 0.0]}, dtype="category") clf = LightGBMRegressor() clf.fit(X2_fit, y) @@ -159,28 +169,32 @@ def test_regression_rf(X_y_regression): X, y = X_y_regression with pytest.raises(lgbm.basic.LightGBMError, match="bagging_fraction"): - clf = 
LightGBMRegressor(boosting_type="rf", bagging_freq=1, bagging_fraction=1.01) + clf = LightGBMRegressor( + boosting_type="rf", bagging_freq=1, bagging_fraction=1.01 + ) clf.fit(X, y) clf = LightGBMRegressor(boosting_type="rf", bagging_freq=0) clf.fit(X, y) - assert clf.parameters['bagging_freq'] == 0 - assert clf.parameters['bagging_fraction'] == 0.9 + assert clf.parameters["bagging_freq"] == 0 + assert clf.parameters["bagging_fraction"] == 0.9 def test_regression_goss(X_y_regression): X, y = X_y_regression clf = LightGBMRegressor(boosting_type="goss") clf.fit(X, y) - assert clf.parameters['bagging_freq'] == 0 - assert clf.parameters['bagging_fraction'] == 0.9 + assert clf.parameters["bagging_freq"] == 0 + assert clf.parameters["bagging_fraction"] == 0.9 -@pytest.mark.parametrize("data_type", ['pd', 'ww']) +@pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_lightgbm_multiindex(data_type, X_y_regression, make_data_type): X, y = X_y_regression X = pd.DataFrame(X) - col_names = [('column_{}'.format(num), '{}'.format(num)) for num in range(len(X.columns))] + col_names = [ + ("column_{}".format(num), "{}".format(num)) for num in range(len(X.columns)) + ] X.columns = pd.MultiIndex.from_tuples(col_names) X = make_data_type(data_type, X) y = make_data_type(data_type, y) diff --git a/evalml/tests/component_tests/test_lsa.py b/evalml/tests/component_tests/test_lsa.py index c5803ee87e..dd6f7a42df 100644 --- a/evalml/tests/component_tests/test_lsa.py +++ b/evalml/tests/component_tests/test_lsa.py @@ -8,7 +8,7 @@ Categorical, Double, Integer, - NaturalLanguage + NaturalLanguage, ) from evalml.pipelines.components import LSA @@ -20,10 +20,9 @@ def test_lsa_only_text(text_df): lsa = LSA() lsa.fit(X) - expected_col_names = set(['LSA(col_1)[0]', - 'LSA(col_1)[1]', - 'LSA(col_2)[0]', - 'LSA(col_2)[1]']) + expected_col_names = set( + ["LSA(col_1)[0]", "LSA(col_1)[1]", "LSA(col_2)[0]", "LSA(col_2)[1]"] + ) X_t = lsa.transform(X) assert set(X_t.columns) == expected_col_names assert len(X_t.columns) == 4 @@ -32,15 +31,13 @@ def test_lsa_only_text(text_df): def test_lsa_with_nontext(text_df): X = text_df - X['col_3'] = [73.7, 67.213, 92] + X["col_3"] = [73.7, 67.213, 92] lsa = LSA() lsa.fit(X) - expected_col_names = set(['LSA(col_1)[0]', - 'LSA(col_1)[1]', - 'LSA(col_2)[0]', - 'LSA(col_2)[1]', - 'col_3']) + expected_col_names = set( + ["LSA(col_1)[0]", "LSA(col_1)[1]", "LSA(col_2)[0]", "LSA(col_2)[1]", "col_3"] + ) X_t = lsa.transform(X) assert set(X_t.columns) == expected_col_names assert len(X_t.columns) == 5 @@ -48,7 +45,7 @@ def test_lsa_with_nontext(text_df): def test_lsa_no_text(): - X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]}) + X = pd.DataFrame({"col_1": [1, 2, 3], "col_2": [4, 5, 6]}) lsa = LSA() lsa.fit(X) X_t = lsa.transform(X) @@ -57,11 +54,10 @@ def test_lsa_no_text(): def test_some_missing_col_names(text_df, caplog): X = text_df - expected_col_names = set(['LSA(col_1)[0]', - 'LSA(col_1)[1]', - 'LSA(col_2)[0]', - 'LSA(col_2)[1]']) - lsa = LSA(text_columns=['col_1', 'col_2', 'col_3']) + expected_col_names = set( + ["LSA(col_1)[0]", "LSA(col_1)[1]", "LSA(col_2)[0]", "LSA(col_2)[1]"] + ) + lsa = LSA(text_columns=["col_1", "col_2", "col_3"]) lsa.fit(X) X_t = lsa.transform(X) assert set(X_t.columns) == expected_col_names @@ -70,10 +66,13 @@ def test_some_missing_col_names(text_df, caplog): def test_lsa_empty_text_column(): - X = pd.DataFrame({'col_1': []}) - X = infer_feature_types(X, {'col_1': 'NaturalLanguage'}) + X = pd.DataFrame({"col_1": []}) + X = infer_feature_types(X, 
{"col_1": "NaturalLanguage"}) lsa = LSA() - with pytest.raises(ValueError, match="empty vocabulary; perhaps the documents only contain stop words"): + with pytest.raises( + ValueError, + match="empty vocabulary; perhaps the documents only contain stop words", + ): lsa.fit(X) @@ -81,29 +80,43 @@ def test_lsa_text_column_with_nonstring_values(): # we assume this sort of data would fail to validate as text data up the stack # but just in case, make sure our component will convert non-str values to str X = pd.DataFrame( - {'col_1': [ - 'I\'m singing in the rain!$%^ do do do do do da do', - 'just singing in the rain.................. \n', - 325, - np.nan, - None, - 'I\'m happy again!!! lalalalalalalalalalala']}) - X = infer_feature_types(X, {'col_1': 'NaturalLanguage'}) + { + "col_1": [ + "I'm singing in the rain!$%^ do do do do do da do", + "just singing in the rain.................. \n", + 325, + np.nan, + None, + "I'm happy again!!! lalalalalalalalalalala", + ] + } + ) + X = infer_feature_types(X, {"col_1": "NaturalLanguage"}) lsa = LSA() lsa.fit(X) def test_index_col_names(): - X = np.array([['I\'m singing in the rain!$%^ do do do do do da do', 'do you hear the people sing?////////////////////////////////////'], - ['just singing in the rain.................. \n', 'singing the songs of angry men\n'], - ['\t\n\n\n\nWhat a glorious feelinggggggggggg, I\'m happy again!!! lalalalalalalalalalala', '\tIt is the music of a people who will NOT be slaves again!!!!!!!!!!!']]) + X = np.array( + [ + [ + "I'm singing in the rain!$%^ do do do do do da do", + "do you hear the people sing?////////////////////////////////////", + ], + [ + "just singing in the rain.................. \n", + "singing the songs of angry men\n", + ], + [ + "\t\n\n\n\nWhat a glorious feelinggggggggggg, I'm happy again!!! lalalalalalalalalalala", + "\tIt is the music of a people who will NOT be slaves again!!!!!!!!!!!", + ], + ] + ) lsa = LSA() lsa.fit(X) - expected_col_names = set(['LSA(0)[0]', - 'LSA(0)[1]', - 'LSA(1)[0]', - 'LSA(1)[1]']) + expected_col_names = set(["LSA(0)[0]", "LSA(0)[1]", "LSA(1)[0]", "LSA(1)[1]"]) X_t = lsa.transform(X) assert set(X_t.columns) == expected_col_names assert len(X_t.columns) == 4 @@ -112,19 +125,24 @@ def test_index_col_names(): def test_float_col_names(): X = pd.DataFrame( - {4.75: ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!', - 'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.', - 'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!'], - -1: ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', - 'I dreamed a dream in days gone by, when hope was high and life worth living', - 'Red, the blood of angry men - black, the dark of ages past'] - }) + { + 4.75: [ + "I'm singing in the rain! Just singing in the rain, what a glorious feeling, I'm happy again!", + "In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.", + "I'm gonna be the main event, like no king was before! I'm brushing up on looking down, I'm working on my ROAR!", + ], + -1: [ + "do you hear the people sing? 
Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!", + "I dreamed a dream in days gone by, when hope was high and life worth living", + "Red, the blood of angry men - black, the dark of ages past", + ], + } + ) lsa = LSA() lsa.fit(X) - expected_col_names = set(['LSA(4.75)[0]', - 'LSA(4.75)[1]', - 'LSA(-1.0)[0]', - 'LSA(-1.0)[1]']) + expected_col_names = set( + ["LSA(4.75)[0]", "LSA(4.75)[1]", "LSA(-1.0)[0]", "LSA(-1.0)[1]"] + ) X_t = lsa.transform(X) assert set(X_t.columns) == expected_col_names assert len(X_t.columns) == 4 @@ -133,16 +151,21 @@ def test_float_col_names(): def test_lsa_output(): X = pd.DataFrame( - {'lsa': ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', - 'I dreamed a dream in days gone by, when hope was high and life worth living', - 'Red, the blood of angry men - black, the dark of ages past']}) + { + "lsa": [ + "do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!", + "I dreamed a dream in days gone by, when hope was high and life worth living", + "Red, the blood of angry men - black, the dark of ages past", + ] + } + ) lsa = LSA() lsa.fit(X) - expected_features = pd.DataFrame([[0.832, 0.], - [0., 1.], - [0.832, 0.]], columns=["LSA(lsa)[0]", "LSA(lsa)[1]"]) + expected_features = pd.DataFrame( + [[0.832, 0.0], [0.0, 1.0], [0.832, 0.0]], columns=["LSA(lsa)[0]", "LSA(lsa)[1]"] + ) X_t = lsa.transform(X) - cols = [col for col in X_t.columns if 'LSA' in col] + cols = [col for col in X_t.columns if "LSA" in col] features = X_t[cols] assert_frame_equal(expected_features, features, atol=1e-3) @@ -150,22 +173,37 @@ def test_lsa_output(): def test_lsa_with_custom_indices(text_df): X = text_df X = X.set_index(pd.Series([2, 5, 19])) - lsa = LSA(text_columns=['col_1', 'col_2']) + lsa = LSA(text_columns=["col_1", "col_2"]) lsa.fit(X) X_t = lsa.transform(X) assert not X_t.isnull().any().any() -@pytest.mark.parametrize("X_df", [pd.DataFrame(pd.to_datetime(['20190902', '20200519', '20190607'], format='%Y%m%d')), - pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), - pd.DataFrame(pd.Series([1., 2., 3.], dtype="float")), - pd.DataFrame(pd.Series(['a', 'b', 'a'], dtype="category")), - pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), - pd.DataFrame(pd.Series(['this will be a natural language column because length', 'yay', 'hay'], dtype="string"))]) +@pytest.mark.parametrize( + "X_df", + [ + pd.DataFrame( + pd.to_datetime(["20190902", "20200519", "20190607"], format="%Y%m%d") + ), + pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), + pd.DataFrame(pd.Series([1.0, 2.0, 3.0], dtype="float")), + pd.DataFrame(pd.Series(["a", "b", "a"], dtype="category")), + pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), + pd.DataFrame( + pd.Series( + ["this will be a natural language column because length", "yay", "hay"], + dtype="string", + ) + ), + ], +) def test_lsa_woodwork_custom_overrides_returned_by_components(X_df): y = pd.Series([1, 2, 1]) override_types = [Integer, Double, Categorical, Boolean, NaturalLanguage] - X_df['text col'] = pd.Series(['this will be a natural language column because length', 'yay', 'hay'], dtype="string") + X_df["text col"] = pd.Series( + ["this will be a natural language column because length", "yay", "hay"], + dtype="string", + ) lsa = LSA() for logical_type in override_types: try: @@ -178,6 +216,15 @@ def 
test_lsa_woodwork_custom_overrides_returned_by_components(X_df): transformed = lsa.transform(X, y) assert isinstance(transformed, pd.DataFrame) if logical_type == NaturalLanguage: - assert transformed.ww.logical_types == {'LSA(0)[0]': Double, 'LSA(0)[1]': Double, 'LSA(text col)[0]': Double, 'LSA(text col)[1]': Double} + assert transformed.ww.logical_types == { + "LSA(0)[0]": Double, + "LSA(0)[1]": Double, + "LSA(text col)[0]": Double, + "LSA(text col)[1]": Double, + } else: - assert transformed.ww.logical_types == {0: logical_type, 'LSA(text col)[0]': Double, 'LSA(text col)[1]': Double} + assert transformed.ww.logical_types == { + 0: logical_type, + "LSA(text col)[0]": Double, + "LSA(text col)[1]": Double, + } diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py index 9abb9f4149..93adec80b8 100644 --- a/evalml/tests/component_tests/test_one_hot_encoder.py +++ b/evalml/tests/component_tests/test_one_hot_encoder.py @@ -9,7 +9,7 @@ Datetime, Double, Integer, - NaturalLanguage + NaturalLanguage, ) from evalml.exceptions import ComponentNotYetFittedError @@ -18,12 +18,14 @@ def test_init(): - parameters = {'top_n': 10, - 'features_to_encode': None, - 'categories': None, - 'drop': 'if_binary', - 'handle_unknown': 'ignore', - 'handle_missing': 'error'} + parameters = { + "top_n": 10, + "features_to_encode": None, + "categories": None, + "drop": "if_binary", + "handle_unknown": "ignore", + "handle_missing": "error", + } encoder = OneHotEncoder() assert encoder.parameters == parameters @@ -31,12 +33,12 @@ def test_init(): def test_parameters(): encoder = OneHotEncoder(top_n=123) expected_parameters = { - 'top_n': 123, - 'features_to_encode': None, - 'categories': None, - 'drop': 'if_binary', - 'handle_unknown': 'ignore', - 'handle_missing': 'error' + "top_n": 123, + "features_to_encode": None, + "categories": None, + "drop": "if_binary", + "handle_unknown": "ignore", + "handle_missing": "error", } assert encoder.parameters == expected_parameters @@ -50,9 +52,13 @@ def test_invalid_inputs(): with pytest.raises(ValueError, match=error_msg): encoder = OneHotEncoder(handle_unknown="bananas") - X = pd.DataFrame({'col_1': ["a", "b", "c", "d", "a"], - 'col_2': ["a", "b", "a", "c", "b"], - 'col_3': ["a", "a", "a", "a", "a"]}) + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "a"], + "col_2": ["a", "b", "a", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + } + ) encoder = OneHotEncoder(top_n=None, categories=[["a", "b"], ["a", "c"]]) error_msg = "Categories argument must contain a list of categories for each categorical feature" with pytest.raises(ValueError, match=error_msg): @@ -65,91 +71,134 @@ def test_invalid_inputs(): def test_null_values_in_dataframe(): - X = pd.DataFrame({'col_1': ["a", "b", "c", "d", np.nan], - 'col_2': ["a", "b", "a", "c", "b"], - 'col_3': ["a", "a", "a", "a", "a"]}) + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", np.nan], + "col_2": ["a", "b", "a", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + } + ) # Test NaN will be counted as a category if within the top_n - encoder = OneHotEncoder(handle_missing='as_category') + encoder = OneHotEncoder(handle_missing="as_category") encoder.fit(X) X_t = encoder.transform(X) - expected_col_names = set(["col_1_a", "col_1_b", "col_1_c", "col_1_d", "col_1_nan", - "col_2_a", "col_2_b", "col_2_c", "col_3_a"]) + expected_col_names = set( + [ + "col_1_a", + "col_1_b", + "col_1_c", + "col_1_d", + "col_1_nan", + "col_2_a", + "col_2_b", + "col_2_c", + "col_3_a", + ] 
+ ) col_names = set(X_t.columns) - assert (col_names == expected_col_names) + assert col_names == expected_col_names assert X_t.shape == (5, 9) # Test NaN will not be counted as a category if not in the top_n - X = pd.DataFrame({'col_1': ["a", "a", "c", "c", np.nan], - 'col_2': ["a", "b", "a", "c", "b"], - 'col_3': ["a", "a", "a", "a", "a"], - 'col_4': [2, 0, 1, np.nan, 0]}) - - encoder = OneHotEncoder(top_n=2, handle_missing='as_category') + X = pd.DataFrame( + { + "col_1": ["a", "a", "c", "c", np.nan], + "col_2": ["a", "b", "a", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + "col_4": [2, 0, 1, np.nan, 0], + } + ) + + encoder = OneHotEncoder(top_n=2, handle_missing="as_category") encoder.fit(X) X_t = encoder.transform(X) - expected_col_names = set(["col_1_a", "col_1_c", "col_2_a", "col_2_b", "col_3_a", "col_4"]) + expected_col_names = set( + ["col_1_a", "col_1_c", "col_2_a", "col_2_b", "col_3_a", "col_4"] + ) col_names = set(X_t.columns) - assert (col_names == expected_col_names) + assert col_names == expected_col_names assert X_t.shape == (5, 6) # Test handle_missing='error' throws an error - encoder = OneHotEncoder(handle_missing='error') + encoder = OneHotEncoder(handle_missing="error") - X = pd.DataFrame({"col_1": [np.nan, "b", "c", "d", "e", "f", "g"], - "col_2": ["a", "c", "d", "b", "e", "e", "f"], - "col_3": ["a", "a", "a", "a", "a", "a", "b"], - "col_4": [2, 0, 1, 3, 0, 1, 2]}) + X = pd.DataFrame( + { + "col_1": [np.nan, "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "c", "d", "b", "e", "e", "f"], + "col_3": ["a", "a", "a", "a", "a", "a", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 2], + } + ) with pytest.raises(ValueError, match="Input contains NaN"): encoder.fit(X) # Test NaN values in transformed data - X = pd.DataFrame({'col_1': ["a", "b", "c", "d", "d"], - 'col_2': ["a", "b", "a", "c", "b"], - 'col_3': ["a", "a", "a", "a", "a"]}) - encoder = OneHotEncoder(handle_missing='error') + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "d"], + "col_2": ["a", "b", "a", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + } + ) + encoder = OneHotEncoder(handle_missing="error") encoder.fit(X) - X_missing = pd.DataFrame({'col_1': ["a", "b", "c", "d", "d"], - 'col_2': ["a", "b", np.nan, "c", "b"], - 'col_3': ["a", "a", "a", "a", "a"]}) + X_missing = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "d"], + "col_2": ["a", "b", np.nan, "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + } + ) with pytest.raises(ValueError, match="Input contains NaN"): encoder.transform(X_missing) def test_drop_first(): - X = pd.DataFrame({'col_1': ["a", "b", "c", "d", "d"], - 'col_2': ["a", "b", "a", "c", "b"], - 'col_3': ["a", "a", "a", "a", "a"]}) - encoder = OneHotEncoder(top_n=None, drop='first', handle_unknown='error') + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "d"], + "col_2": ["a", "b", "a", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + } + ) + encoder = OneHotEncoder(top_n=None, drop="first", handle_unknown="error") encoder.fit(X) X_t = encoder.transform(X) col_names = set(X_t.columns) - expected_col_names = set(["col_1_b", "col_1_c", "col_1_d", - "col_2_b", "col_2_c"]) + expected_col_names = set(["col_1_b", "col_1_c", "col_1_d", "col_2_b", "col_2_c"]) assert col_names == expected_col_names def test_drop_binary(): - X = pd.DataFrame({'col_1': ["a", "b", "b", "a", "b"], - 'col_2': ["a", "b", "a", "c", "b"], - 'col_3': ["a", "a", "a", "a", "a"]}) - encoder = OneHotEncoder(top_n=None, drop='if_binary', handle_unknown='error') + X = pd.DataFrame( + { + "col_1": ["a", "b", "b", "a", 
"b"], + "col_2": ["a", "b", "a", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + } + ) + encoder = OneHotEncoder(top_n=None, drop="if_binary", handle_unknown="error") encoder.fit(X) X_t = encoder.transform(X) col_names = set(X_t.columns) - expected_col_names = set(["col_1_a", "col_2_a", - "col_2_b", "col_2_c", "col_3_a"]) + expected_col_names = set(["col_1_a", "col_2_a", "col_2_b", "col_2_c", "col_3_a"]) assert col_names == expected_col_names def test_drop_parameter_is_array(): - X = pd.DataFrame({'col_1': ["a", "b", "b", "a", "b"], - 'col_2': ["a", "b", "a", "c", "b"], - 'col_3': ["a", "a", "a", "a", "a"]}) - encoder = OneHotEncoder(top_n=None, drop=["b", "c", "a"], handle_unknown='error') + X = pd.DataFrame( + { + "col_1": ["a", "b", "b", "a", "b"], + "col_2": ["a", "b", "a", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + } + ) + encoder = OneHotEncoder(top_n=None, drop=["b", "c", "a"], handle_unknown="error") encoder.fit(X) X_t = encoder.transform(X) col_names = set(X_t.columns) @@ -160,10 +209,14 @@ def test_drop_parameter_is_array(): def test_drop_binary_and_top_n_2(): # Test that columns that originally had two values have one column dropped, # but columns that end up with two values keep both values - X = pd.DataFrame({'col_1': ["a", "b", "b", "a", "b"], - 'col_2': ["a", "b", "a", "c", "b"], - 'col_3': ["a", "a", "a", "a", "a"]}) - encoder = OneHotEncoder(top_n=2, drop='if_binary') + X = pd.DataFrame( + { + "col_1": ["a", "b", "b", "a", "b"], + "col_2": ["a", "b", "a", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + } + ) + encoder = OneHotEncoder(top_n=2, drop="if_binary") encoder.fit(X) X_t = encoder.transform(X) col_names = set(X_t.columns) @@ -172,19 +225,27 @@ def test_drop_binary_and_top_n_2(): def test_handle_unknown(): - X = pd.DataFrame({"col_1": ["a", "b", "c", "d", "e", "f", "g"], - "col_2": ["a", "c", "d", "b", "e", "e", "f"], - "col_3": ["a", "a", "a", "a", "a", "a", "b"], - "col_4": [2, 0, 1, 3, 0, 1, 2]}) - - encoder = OneHotEncoder(handle_unknown='error') + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "c", "d", "b", "e", "e", "f"], + "col_3": ["a", "a", "a", "a", "a", "a", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 2], + } + ) + + encoder = OneHotEncoder(handle_unknown="error") encoder.fit(X) assert isinstance(encoder.transform(X), pd.DataFrame) - X = pd.DataFrame({"col_1": ["x", "b", "c", "d", "e", "f", "g"], - "col_2": ["a", "c", "d", "b", "e", "e", "f"], - "col_3": ["a", "a", "a", "a", "a", "a", "b"], - "col_4": [2, 0, 1, 3, 0, 1, 2]}) + X = pd.DataFrame( + { + "col_1": ["x", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "c", "d", "b", "e", "e", "f"], + "col_3": ["a", "a", "a", "a", "a", "a", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 2], + } + ) with pytest.raises(ValueError) as exec_info: encoder.transform(X) assert "Found unknown categories" in exec_info.value.args[0] @@ -192,10 +253,14 @@ def test_handle_unknown(): def test_no_top_n(): # test all categories in all columns are encoded when top_n is None - X = pd.DataFrame({"col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], - "col_2": ["a", "c", "d", "b", "e", "e", "f", "a", "b", "c", "d"], - "col_3": ["a", "a", "a", "a", "a", "a", "b", "a", "a", "b", "b"], - "col_4": [2, 0, 1, 3, 0, 1, 2, 0, 2, 1, 2]}) + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], + "col_2": ["a", "c", "d", "b", "e", "e", "f", "a", "b", "c", "d"], + "col_3": ["a", "a", "a", "a", "a", "a", "b", "a", "a", "b", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 
2, 0, 2, 1, 2], + } + ) expected_col_names = set(["col_3_b", "col_4"]) for val in X["col_1"]: expected_col_names.add("col_1_" + val) @@ -207,14 +272,18 @@ def test_no_top_n(): X_t = encoder.transform(X) col_names = set(X_t.columns) - assert (X_t.shape == (11, 19)) - assert (col_names == expected_col_names) + assert X_t.shape == (11, 19) + assert col_names == expected_col_names # Make sure unknown values cause an error - X_new = pd.DataFrame({"col_1": ["a", "b", "c", "x"], - "col_2": ["a", "c", "d", "b"], - "col_3": ["a", "a", "a", "a"], - "col_4": [2, 0, 1, 3]}) + X_new = pd.DataFrame( + { + "col_1": ["a", "b", "c", "x"], + "col_2": ["a", "c", "d", "b"], + "col_3": ["a", "a", "a", "a"], + "col_4": [2, 0, 1, 3], + } + ) with pytest.raises(ValueError) as exec_info: encoder.transform(X_new) @@ -222,14 +291,16 @@ def test_no_top_n(): def test_categories(): - X = pd.DataFrame({"col_1": ["a", "b", "c", "d", "e", "f", "g"], - "col_2": ["a", "c", "d", "b", "e", "e", "f"], - "col_3": ["a", "a", "a", "a", "a", "a", "b"], - "col_4": [2, 0, 1, 3, 0, 1, 2]}) + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "c", "d", "b", "e", "e", "f"], + "col_3": ["a", "a", "a", "a", "a", "a", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 2], + } + ) - categories = [["a", "b", "c", "d"], - ["a", "b", "c"], - ["a", "b"]] + categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]] # test categories value works encoder = OneHotEncoder(top_n=None, categories=categories, random_seed=2) @@ -237,39 +308,71 @@ def test_categories(): X_t = encoder.transform(X) col_names = set(X_t.columns) - expected_col_names = set(["col_1_a", "col_1_b", "col_1_c", "col_1_d", - "col_2_a", "col_2_b", "col_2_c", "col_3_a", - "col_3_b", "col_4"]) - assert (X_t.shape == (7, 10)) - assert (col_names == expected_col_names) + expected_col_names = set( + [ + "col_1_a", + "col_1_b", + "col_1_c", + "col_1_d", + "col_2_a", + "col_2_b", + "col_2_c", + "col_3_a", + "col_3_b", + "col_4", + ] + ) + assert X_t.shape == (7, 10) + assert col_names == expected_col_names # test categories with top_n errors - with pytest.raises(ValueError, match="Cannot use categories and top_n arguments simultaneously"): + with pytest.raises( + ValueError, match="Cannot use categories and top_n arguments simultaneously" + ): encoder = OneHotEncoder(top_n=10, categories=categories, random_seed=2) def test_less_than_top_n_unique_values(): # test that columns with less than n unique values encodes properly - X = pd.DataFrame({"col_1": ["a", "b", "c", "d", "a"], - "col_2": ["a", "b", "a", "c", "b"], - "col_3": ["a", "a", "a", "a", "a"], - "col_4": [2, 0, 1, 0, 0]}) + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "a"], + "col_2": ["a", "b", "a", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + "col_4": [2, 0, 1, 0, 0], + } + ) encoder = OneHotEncoder(top_n=5) encoder.fit(X) X_t = encoder.transform(X) - expected_col_names = set(["col_1_a", "col_1_b", "col_1_c", "col_1_d", - "col_2_a", "col_2_b", "col_2_c", "col_3_a", "col_4"]) + expected_col_names = set( + [ + "col_1_a", + "col_1_b", + "col_1_c", + "col_1_d", + "col_2_a", + "col_2_b", + "col_2_c", + "col_3_a", + "col_4", + ] + ) col_names = set(X_t.columns) - assert (col_names == expected_col_names) + assert col_names == expected_col_names def test_more_top_n_unique_values(): # test that columns with >= n unique values encodes properly - X = pd.DataFrame({"col_1": ["a", "b", "c", "d", "e", "f", "g"], - "col_2": ["a", "c", "d", "b", "e", "e", "f"], - "col_3": ["a", "a", "a", "a", "a", "a", 
"b"], - "col_4": [2, 0, 1, 3, 0, 1, 2]}) + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "e", "f", "g"], + "col_2": ["a", "c", "d", "b", "e", "e", "f"], + "col_3": ["a", "a", "a", "a", "a", "a", "b"], + "col_4": [2, 0, 1, 3, 0, 1, 2], + } + ) random_seed = 2 @@ -281,13 +384,17 @@ def test_more_top_n_unique_values(): X = infer_feature_types(X) col_1_counts = X["col_1"].value_counts(dropna=False).to_frame() col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed) - col_1_counts = col_1_counts.sort_values(["col_1"], ascending=False, kind='mergesort') - col_1_samples = col_1_counts.head(encoder.parameters['top_n']).index.tolist() + col_1_counts = col_1_counts.sort_values( + ["col_1"], ascending=False, kind="mergesort" + ) + col_1_samples = col_1_counts.head(encoder.parameters["top_n"]).index.tolist() col_2_counts = X["col_2"].value_counts(dropna=False).to_frame() col_2_counts = col_2_counts.sample(frac=1, random_state=random_seed) - col_2_counts = col_2_counts.sort_values(["col_2"], ascending=False, kind='mergesort') - col_2_samples = col_2_counts.head(encoder.parameters['top_n']).index.tolist() + col_2_counts = col_2_counts.sort_values( + ["col_2"], ascending=False, kind="mergesort" + ) + col_2_samples = col_2_counts.head(encoder.parameters["top_n"]).index.tolist() expected_col_names = set(["col_2_e", "col_3_b", "col_4"]) for val in col_1_samples: @@ -296,14 +403,18 @@ def test_more_top_n_unique_values(): expected_col_names.add("col_2_" + val) col_names = set(X_t.columns) - assert (col_names == expected_col_names) + assert col_names == expected_col_names def test_more_top_n_unique_values_large(): - X = pd.DataFrame({"col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i"], - "col_2": ["a", "a", "a", "b", "b", "c", "c", "d", "e"], - "col_3": ["a", "a", "a", "b", "b", "b", "c", "c", "d"], - "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1]}) + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i"], + "col_2": ["a", "a", "a", "b", "b", "c", "c", "d", "e"], + "col_3": ["a", "a", "a", "b", "b", "b", "c", "c", "d"], + "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1], + } + ) random_seed = 2 @@ -315,42 +426,67 @@ def test_more_top_n_unique_values_large(): X = infer_feature_types(X) col_1_counts = X["col_1"].value_counts(dropna=False).to_frame() col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed) - col_1_counts = col_1_counts.sort_values(["col_1"], ascending=False, kind='mergesort') - col_1_samples = col_1_counts.head(encoder.parameters['top_n']).index.tolist() - expected_col_names = set(["col_2_a", "col_2_b", "col_2_c", "col_3_a", "col_3_b", "col_3_c", "col_4"]) + col_1_counts = col_1_counts.sort_values( + ["col_1"], ascending=False, kind="mergesort" + ) + col_1_samples = col_1_counts.head(encoder.parameters["top_n"]).index.tolist() + expected_col_names = set( + ["col_2_a", "col_2_b", "col_2_c", "col_3_a", "col_3_b", "col_3_c", "col_4"] + ) for val in col_1_samples: expected_col_names.add("col_1_" + val) col_names = set(X_t.columns) - assert (col_names == expected_col_names) + assert col_names == expected_col_names def test_categorical_dtype(): # test that columns with the categorical type are encoded properly - X = pd.DataFrame({"col_1": ["f", "b", "c", "d", "e"], - "col_2": ["a", "e", "d", "d", "e"], - "col_3": ["a", "a", "a", "a", "a"], - "col_4": [3, 3, 2, 2, 1]}) - X["col_4"] = X["col_4"].astype('category') + X = pd.DataFrame( + { + "col_1": ["f", "b", "c", "d", "e"], + "col_2": ["a", "e", "d", "d", "e"], + "col_3": ["a", "a", "a", "a", "a"], + "col_4": [3, 3, 
2, 2, 1], + } + ) + X["col_4"] = X["col_4"].astype("category") encoder = OneHotEncoder(top_n=5) encoder.fit(X) X_t = encoder.transform(X) - expected_col_names = set(["col_1_f", "col_1_b", "col_1_c", "col_1_d", "col_1_e", - "col_2_d", "col_2_e", "col_2_a", "col_3_a", - "col_4_1", "col_4_2", "col_4_3"]) + expected_col_names = set( + [ + "col_1_f", + "col_1_b", + "col_1_c", + "col_1_d", + "col_1_e", + "col_2_d", + "col_2_e", + "col_2_a", + "col_3_a", + "col_4_1", + "col_4_2", + "col_4_3", + ] + ) col_names = set(X_t.columns) - assert (col_names == expected_col_names) - assert ([X_t[col].dtype == "uint8" for col in X_t]) + assert col_names == expected_col_names + assert [X_t[col].dtype == "uint8" for col in X_t] def test_all_numerical_dtype(): # test that columns with the numerical type are preserved - X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], - "col_2": [3, 2, 5, 1, 3], - "col_3": [0, 0, 1, 3, 2], - "col_4": [2, 4, 1, 4, 0]}) + X = pd.DataFrame( + { + "col_1": [2, 0, 1, 0, 0], + "col_2": [3, 2, 5, 1, 3], + "col_3": [0, 0, 1, 3, 2], + "col_4": [2, 4, 1, 4, 0], + } + ) X_expected = X.copy() encoder = OneHotEncoder(top_n=5) encoder.fit(X) @@ -372,44 +508,51 @@ def test_large_number_of_categories(): X = np.repeat(np.arange(n_categories), frequency_per_category).reshape((-1, 1)) X_extra = np.repeat(np.arange(10) + n_categories, 10).reshape((-1, 1)) X = np.array(np.concatenate([X, X_extra])) - X = pd.DataFrame(X, columns=['cat']) - X['cat'] = X['cat'].astype('category') + X = pd.DataFrame(X, columns=["cat"]) + X["cat"] = X["cat"].astype("category") encoder = OneHotEncoder(top_n=10) encoder.fit(X) X_t = encoder.transform(X) - expected_col_names = ['cat_' + str(200000 + i) for i in range(10)] + expected_col_names = ["cat_" + str(200000 + i) for i in range(10)] assert X_t.shape == (1000100, 10) assert set(expected_col_names) == set(list(X_t.columns)) -@pytest.mark.parametrize('data_type', ['list', 'np', 'pd_no_index', 'pd_index', 'ww']) +@pytest.mark.parametrize("data_type", ["list", "np", "pd_no_index", "pd_index", "ww"]) def test_data_types(data_type): - if data_type == 'list': + if data_type == "list": X = [["a"], ["b"], ["c"]] - elif data_type == 'np': + elif data_type == "np": X = np.array([["a"], ["b"], ["c"]]) - elif data_type == 'pd_no_index': + elif data_type == "pd_no_index": X = pd.DataFrame(["a", "b", "c"]) - elif data_type == 'pd_index': - X = pd.DataFrame(["a", "b", "c"], columns=['0']) - elif data_type == 'ww': + elif data_type == "pd_index": + X = pd.DataFrame(["a", "b", "c"], columns=["0"]) + elif data_type == "ww": X = pd.DataFrame(["a", "b", "c"]) X.ww.init() encoder = OneHotEncoder() encoder.fit(X) X_t = encoder.transform(X) - assert list(X_t.columns) == ['0_a', '0_b', '0_c'] + assert list(X_t.columns) == ["0_a", "0_b", "0_c"] np.testing.assert_array_equal(X_t.to_numpy(), np.identity(3)) -@pytest.mark.parametrize("index", [list(range(-5, 0)), - list(range(100, 105)), - [f"row_{i}" for i in range(5)], - pd.date_range("2020-09-08", periods=5)]) +@pytest.mark.parametrize( + "index", + [ + list(range(-5, 0)), + list(range(100, 105)), + [f"row_{i}" for i in range(5)], + pd.date_range("2020-09-08", periods=5), + ], +) def test_ohe_preserves_custom_index(index): - df = pd.DataFrame({"categories": [f"cat_{i}" for i in range(5)], "numbers": np.arange(5)}, - index=index) + df = pd.DataFrame( + {"categories": [f"cat_{i}" for i in range(5)], "numbers": np.arange(5)}, + index=index, + ) ohe = OneHotEncoder() new_df = ohe.fit_transform(df) pd.testing.assert_index_equal(new_df.index, df.index) 
@@ -417,64 +560,77 @@ def test_ohe_preserves_custom_index(index): def test_ohe_categories(): - X = pd.DataFrame({'col_1': ['a'] * 10, - 'col_2': ['a'] * 3 + ['b'] * 3 + ['c'] * 2 + ['d'] * 2}) + X = pd.DataFrame( + {"col_1": ["a"] * 10, "col_2": ["a"] * 3 + ["b"] * 3 + ["c"] * 2 + ["d"] * 2} + ) ohe = OneHotEncoder(top_n=2) - with pytest.raises(ComponentNotYetFittedError, match='This OneHotEncoder is not fitted yet. You must fit OneHotEncoder before calling categories.'): - ohe.categories('col_1') + with pytest.raises( + ComponentNotYetFittedError, + match="This OneHotEncoder is not fitted yet. You must fit OneHotEncoder before calling categories.", + ): + ohe.categories("col_1") ohe.fit(X) - np.testing.assert_array_equal(ohe.categories('col_1'), np.array(['a'])) - np.testing.assert_array_equal(ohe.categories('col_2'), np.array(['a', 'b'])) - with pytest.raises(ValueError, match='Feature "col_12345" was not provided to one-hot encoder as a training feature'): - ohe.categories('col_12345') + np.testing.assert_array_equal(ohe.categories("col_1"), np.array(["a"])) + np.testing.assert_array_equal(ohe.categories("col_2"), np.array(["a", "b"])) + with pytest.raises( + ValueError, + match='Feature "col_12345" was not provided to one-hot encoder as a training feature', + ): + ohe.categories("col_12345") def test_ohe_get_feature_names(): - X = pd.DataFrame({'col_1': ['a'] * 10, - 'col_2': ['a'] * 3 + ['b'] * 3 + ['c'] * 2 + ['d'] * 2}) + X = pd.DataFrame( + {"col_1": ["a"] * 10, "col_2": ["a"] * 3 + ["b"] * 3 + ["c"] * 2 + ["d"] * 2} + ) ohe = OneHotEncoder(top_n=2) - with pytest.raises(ComponentNotYetFittedError, match='This OneHotEncoder is not fitted yet. You must fit OneHotEncoder before calling get_feature_names.'): + with pytest.raises( + ComponentNotYetFittedError, + match="This OneHotEncoder is not fitted yet. 
You must fit OneHotEncoder before calling get_feature_names.", + ): ohe.get_feature_names() ohe.fit(X) - np.testing.assert_array_equal(ohe.get_feature_names(), np.array(['col_1_a', 'col_2_a', 'col_2_b'])) + np.testing.assert_array_equal( + ohe.get_feature_names(), np.array(["col_1_a", "col_2_a", "col_2_b"]) + ) - X = pd.DataFrame({'col_1': ['a'] * 4 + ['b'] * 6, - 'col_2': ['b'] * 3 + ['c'] * 7}) - ohe = OneHotEncoder(drop='if_binary') + X = pd.DataFrame({"col_1": ["a"] * 4 + ["b"] * 6, "col_2": ["b"] * 3 + ["c"] * 7}) + ohe = OneHotEncoder(drop="if_binary") ohe.fit(X) - np.testing.assert_array_equal(ohe.get_feature_names(), np.array(['col_1_a', 'col_2_b'])) + np.testing.assert_array_equal( + ohe.get_feature_names(), np.array(["col_1_a", "col_2_b"]) + ) def test_ohe_features_to_encode(): # Test feature that doesn't need encoding and # feature that needs encoding but is not specified remain untouched - X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], - "col_2": ['a', 'b', 'a', 'c', 'd']}) + X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]}) - encoder = OneHotEncoder(top_n=5, features_to_encode=['col_1']) + encoder = OneHotEncoder(top_n=5, features_to_encode=["col_1"]) encoder.fit(X) X_t = encoder.transform(X) - expected_col_names = set(['col_1_0', 'col_1_1', 'col_1_2', 'col_2']) + expected_col_names = set(["col_1_0", "col_1_1", "col_1_2", "col_2"]) col_names = set(X_t.columns) - assert (col_names == expected_col_names) - assert ([X_t[col].dtype == "uint8" for col in X_t]) + assert col_names == expected_col_names + assert [X_t[col].dtype == "uint8" for col in X_t] - encoder = OneHotEncoder(top_n=5, features_to_encode=['col_1', 'col_2']) + encoder = OneHotEncoder(top_n=5, features_to_encode=["col_1", "col_2"]) encoder.fit(X) X_t = encoder.transform(X) - expected_col_names = set(['col_1_0', 'col_1_1', 'col_1_2', - 'col_2_a', 'col_2_b', 'col_2_c', 'col_2_d']) + expected_col_names = set( + ["col_1_0", "col_1_1", "col_1_2", "col_2_a", "col_2_b", "col_2_c", "col_2_d"] + ) col_names = set(X_t.columns) - assert (col_names == expected_col_names) - assert ([X_t[col].dtype == "uint8" for col in X_t]) + assert col_names == expected_col_names + assert [X_t[col].dtype == "uint8" for col in X_t] def test_ohe_features_to_encode_col_missing(): - X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], - "col_2": ['a', 'b', 'a', 'c', 'd']}) + X = pd.DataFrame({"col_1": [2, 0, 1, 0, 0], "col_2": ["a", "b", "a", "c", "d"]}) - encoder = OneHotEncoder(top_n=5, features_to_encode=['col_3', 'col_4']) + encoder = OneHotEncoder(top_n=5, features_to_encode=["col_3", "col_4"]) with pytest.raises(ValueError, match="Could not find and encode"): encoder.fit(X) @@ -487,13 +643,21 @@ def test_ohe_features_to_encode_no_col_names(): X_t = encoder.transform(X) expected_col_names = set([1, "0_a"]) col_names = set(X_t.columns) - assert (col_names == expected_col_names) - assert ([X_t[col].dtype == "uint8" for col in X_t]) + assert col_names == expected_col_names + assert [X_t[col].dtype == "uint8" for col in X_t] def test_ohe_top_n_categories_always_the_same(): - df = pd.DataFrame({"categories": ["cat_1"] * 5 + ["cat_2"] * 4 + ["cat_3"] * 3 + ["cat_4"] * 3 + ["cat_5"] * 3, - "numbers": range(18)}) + df = pd.DataFrame( + { + "categories": ["cat_1"] * 5 + + ["cat_2"] * 4 + + ["cat_3"] * 3 + + ["cat_4"] * 3 + + ["cat_5"] * 3, + "numbers": range(18), + } + ) def check_df_equality(random_seed): ohe = OneHotEncoder(top_n=4, random_seed=random_seed) @@ -510,26 +674,60 @@ def test_ohe_column_names_unique(): df_transformed = 
OneHotEncoder().fit_transform(df) assert set(df_transformed.columns) == {"A_x_y", "A_x_y_1"} - df = pd.DataFrame({"A": ["x_y", "z", "z"], "A_x": ["y", "a", "a", ], "A_x_y": ["1", "y", "y"]}) + df = pd.DataFrame( + { + "A": ["x_y", "z", "z"], + "A_x": [ + "y", + "a", + "a", + ], + "A_x_y": ["1", "y", "y"], + } + ) df_transformed = OneHotEncoder().fit_transform(df) # category y in A_x gets mapped to A_x_y_1 because A_x_y already exists # category 1 in A_x_y gets mapped to A_x_y_1_1 because A_x_y_1 already exists assert set(df_transformed.columns) == {"A_x_y", "A_x_y_1", "A_x_y_1_1"} - df = pd.DataFrame({"A": ["x_y", "z", "a"], "A_x": ["y_1", "y", "b"], "A_x_y": ["1", "y", "c"]}) + df = pd.DataFrame( + {"A": ["x_y", "z", "a"], "A_x": ["y_1", "y", "b"], "A_x_y": ["1", "y", "c"]} + ) df_transformed = OneHotEncoder().fit_transform(df) # category y in A_x gets mapped to A_x_y_1 because A_x_y already exists # category y_1 in A_x gets mapped to A_x_y_1_1 because A_x_y_1 already exists # category 1 in A_x_y gets mapped to A_x_y_1_2 because A_x_y_1_1 already exists - assert set(df_transformed.columns) == {"A_x_y", "A_z", "A_a", "A_x_y_1", "A_x_y_1_1", "A_x_b", "A_x_y_1_2", "A_x_y_y", "A_x_y_c"} + assert set(df_transformed.columns) == { + "A_x_y", + "A_z", + "A_a", + "A_x_y_1", + "A_x_y_1_1", + "A_x_b", + "A_x_y_1_2", + "A_x_y_y", + "A_x_y_c", + } -@pytest.mark.parametrize("X_df", [pd.DataFrame(pd.to_datetime(['20190902', '20200519', '20190607'], format='%Y%m%d')), - pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), - pd.DataFrame(pd.Series([1., 2., 3.], dtype="float")), - pd.DataFrame(pd.Series(['a', 'b', 'a'], dtype="category")), - pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), - pd.DataFrame(pd.Series(['this will be a natural language column because length', 'yay', 'hay'], dtype="string"))]) +@pytest.mark.parametrize( + "X_df", + [ + pd.DataFrame( + pd.to_datetime(["20190902", "20200519", "20190607"], format="%Y%m%d") + ), + pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), + pd.DataFrame(pd.Series([1.0, 2.0, 3.0], dtype="float")), + pd.DataFrame(pd.Series(["a", "b", "a"], dtype="category")), + pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), + pd.DataFrame( + pd.Series( + ["this will be a natural language column because length", "yay", "hay"], + dtype="string", + ) + ), + ], +) def test_ohe_woodwork_custom_overrides_returned_by_components(X_df): y = pd.Series([1, 2, 1]) override_types = [Integer, Double, Categorical, NaturalLanguage, Datetime, Boolean] @@ -549,16 +747,20 @@ def test_ohe_woodwork_custom_overrides_returned_by_components(X_df): def test_ohe_output_bools(): - X = pd.DataFrame({"bool": [bool(i % 2) for i in range(100)], - "categorical": ["dog"] * 20 + ["cat"] * 40 + ["fish"] * 40, - "integers": [i for i in range(100)]}) + X = pd.DataFrame( + { + "bool": [bool(i % 2) for i in range(100)], + "categorical": ["dog"] * 20 + ["cat"] * 40 + ["fish"] * 40, + "integers": [i for i in range(100)], + } + ) X.ww.init() y = pd.Series([i % 2 for i in range(100)]) y.ww.init() ohe = OneHotEncoder() output = ohe.fit_transform(X, y) for name, types in output.ww.types["Logical Type"].items(): - if name == 'integers': + if name == "integers": assert str(types) == "Integer" else: assert str(types) == "Boolean" diff --git a/evalml/tests/component_tests/test_oversamplers.py b/evalml/tests/component_tests/test_oversamplers.py index 31f63c32c8..ee1dfb82e5 100644 --- a/evalml/tests/component_tests/test_oversamplers.py +++ b/evalml/tests/component_tests/test_oversamplers.py @@ 
-3,14 +3,13 @@ import pytest from evalml.exceptions import ComponentNotYetFittedError -from evalml.pipelines.components import ( - SMOTENCSampler, - SMOTENSampler, - SMOTESampler -) +from evalml.pipelines.components import SMOTENCSampler, SMOTENSampler, SMOTESampler from evalml.utils.woodwork_utils import infer_feature_types -im = pytest.importorskip('imblearn.over_sampling', reason='Skipping test because imbalanced-learn not installed') +im = pytest.importorskip( + "imblearn.over_sampling", + reason="Skipping test because imbalanced-learn not installed", +) @pytest.mark.parametrize("sampler", [SMOTESampler, SMOTENCSampler, SMOTENSampler]) @@ -19,18 +18,22 @@ def test_init(sampler): "sampling_ratio": 0.5, "k_neighbors": 2, "n_jobs": -1, - "sampling_ratio_dict": None + "sampling_ratio_dict": None, } oversampler = sampler(**parameters) assert oversampler.parameters == parameters -@pytest.mark.parametrize("sampler", [SMOTESampler(sampling_ratio=1), - SMOTENCSampler(sampling_ratio=1), - SMOTENSampler(sampling_ratio=1)]) +@pytest.mark.parametrize( + "sampler", + [ + SMOTESampler(sampling_ratio=1), + SMOTENCSampler(sampling_ratio=1), + SMOTENSampler(sampling_ratio=1), + ], +) def test_none_y(sampler): - X = pd.DataFrame({"a": [i for i in range(5)], - "b": [1 for i in range(5)]}) + X = pd.DataFrame({"a": [i for i in range(5)], "b": [1 for i in range(5)]}) X = infer_feature_types(X, feature_types={"a": "Categorical"}) oversampler = sampler with pytest.raises(ValueError, match="y cannot be none"): @@ -41,9 +44,14 @@ def test_none_y(sampler): oversampler.transform(X, None) -@pytest.mark.parametrize("sampler", [SMOTESampler(sampling_ratio=1), - SMOTENCSampler(sampling_ratio=1), - SMOTENSampler(sampling_ratio=1)]) +@pytest.mark.parametrize( + "sampler", + [ + SMOTESampler(sampling_ratio=1), + SMOTENCSampler(sampling_ratio=1), + SMOTENSampler(sampling_ratio=1), + ], +) @pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) def test_no_oversample(data_type, sampler, make_data_type, X_y_binary): X, y = X_y_binary @@ -63,14 +71,23 @@ def test_no_oversample(data_type, sampler, make_data_type, X_y_binary): np.testing.assert_equal(y, new_y.values) -@pytest.mark.parametrize("sampler", [SMOTESampler(sampling_ratio=1), - SMOTENCSampler(sampling_ratio=1), - SMOTENSampler(sampling_ratio=1)]) +@pytest.mark.parametrize( + "sampler", + [ + SMOTESampler(sampling_ratio=1), + SMOTENCSampler(sampling_ratio=1), + SMOTENSampler(sampling_ratio=1), + ], +) @pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) def test_oversample_imbalanced_binary(data_type, sampler, make_data_type): - X = np.array([[i for i in range(1000)], - [i % 7 for i in range(1000)], - [0.3 * (i % 3) for i in range(1000)]]).T + X = np.array( + [ + [i for i in range(1000)], + [i % 7 for i in range(1000)], + [0.3 * (i % 3) for i in range(1000)], + ] + ).T y = np.array([0] * 150 + [1] * 850) X = make_data_type(data_type, X) y = make_data_type(data_type, y) @@ -89,7 +106,9 @@ def test_oversample_imbalanced_binary(data_type, sampler, make_data_type): assert len(new_y) == new_length value_counts = new_y.value_counts() assert value_counts.values[0] == value_counts.values[1] - pd.testing.assert_series_equal(value_counts, pd.Series([850, 850]), check_dtype=False) + pd.testing.assert_series_equal( + value_counts, pd.Series([850, 850]), check_dtype=False + ) transform_X, transform_y = oversampler.transform(X, y) @@ -100,16 +119,22 @@ def test_oversample_imbalanced_binary(data_type, sampler, make_data_type): @pytest.mark.parametrize("sampling_ratio", [0.2, 
0.5]) @pytest.mark.parametrize("sampler", [SMOTESampler, SMOTENCSampler, SMOTENSampler]) @pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) -def test_oversample_imbalanced_multiclass(data_type, sampler, sampling_ratio, make_data_type): - X = np.array([[i for i in range(1000)], - [i % 7 for i in range(1000)], - [0.3 * (i % 3) for i in range(1000)]]).T +def test_oversample_imbalanced_multiclass( + data_type, sampler, sampling_ratio, make_data_type +): + X = np.array( + [ + [i for i in range(1000)], + [i % 7 for i in range(1000)], + [0.3 * (i % 3) for i in range(1000)], + ] + ).T y = np.array([0] * 800 + [1] * 100 + [2] * 100) X = make_data_type(data_type, X) y = make_data_type(data_type, y) X2 = X oversampler = sampler(sampling_ratio=sampling_ratio) - if sampler.name == 'SMOTENC Oversampler': + if sampler.name == "SMOTENC Oversampler": X2 = infer_feature_types(X, feature_types={0: "Categorical"}) if data_type == "ww": X2.ww.set_types({0: "Categorical"}) @@ -140,7 +165,7 @@ def test_oversample_seed_same_outputs(sampler, X_y_binary): samplers = [] for seed in [0, 0, 1]: oversampler = sampler(sampling_ratio=1, random_seed=seed) - if 'NC' in sampler.name: + if "NC" in sampler.name: X = infer_feature_types(X, feature_types={1: "Categorical"}) oversampler = sampler(sampling_ratio=1, random_seed=seed) samplers.append(oversampler) @@ -161,13 +186,19 @@ def test_oversample_seed_same_outputs(sampler, X_y_binary): pd.testing.assert_series_equal(y1, y2) -@pytest.mark.parametrize("component_sampler,imblearn_sampler", - [(SMOTESampler, im.SMOTE), - (SMOTENCSampler, im.SMOTENC), - (SMOTENSampler, im.SMOTEN)]) -@pytest.mark.parametrize("problem_type", ['binary', 'multiclass']) -def test_samplers_perform_equally(problem_type, component_sampler, imblearn_sampler, X_y_binary, X_y_multi): - if problem_type == 'binary': +@pytest.mark.parametrize( + "component_sampler,imblearn_sampler", + [ + (SMOTESampler, im.SMOTE), + (SMOTENCSampler, im.SMOTENC), + (SMOTENSampler, im.SMOTEN), + ], +) +@pytest.mark.parametrize("problem_type", ["binary", "multiclass"]) +def test_samplers_perform_equally( + problem_type, component_sampler, imblearn_sampler, X_y_binary, X_y_multi +): + if problem_type == "binary": X, _ = X_y_binary y = np.array([0] * 90 + [1] * 10) imb_learn_sampling_ratio = 0.5 @@ -178,16 +209,30 @@ def test_samplers_perform_equally(problem_type, component_sampler, imblearn_samp imb_learn_sampling_ratio = {0: 70, 1: 35, 2: 35} expected_y = np.array([0] * 70 + [1] * 35 + [2] * 35) sampling_ratio = 0.5 - sampling_dic = {'sampling_ratio': sampling_ratio} + sampling_dic = {"sampling_ratio": sampling_ratio} X2 = X random_seed = 1 if component_sampler != SMOTENCSampler: component = component_sampler(**sampling_dic, random_seed=random_seed) - imb_sampler = imblearn_sampler(sampling_strategy=imb_learn_sampling_ratio, random_state=random_seed) + imb_sampler = imblearn_sampler( + sampling_strategy=imb_learn_sampling_ratio, random_state=random_seed + ) else: - X2 = infer_feature_types(X, feature_types={1: "Categorical", 2: "Categorical", 3: "Categorical", 4: "Categorical"}) + X2 = infer_feature_types( + X, + feature_types={ + 1: "Categorical", + 2: "Categorical", + 3: "Categorical", + 4: "Categorical", + }, + ) component = component_sampler(**sampling_dic, random_seed=random_seed) - imb_sampler = imblearn_sampler(sampling_strategy=imb_learn_sampling_ratio, categorical_features=[1, 2, 3, 4], random_state=random_seed) + imb_sampler = imblearn_sampler( + sampling_strategy=imb_learn_sampling_ratio, + 
categorical_features=[1, 2, 3, 4], + random_state=random_seed, + ) X_com, y_com = component.fit_transform(X2, y) X_im, y_im = imb_sampler.fit_resample(X, y) @@ -199,7 +244,7 @@ def test_samplers_perform_equally(problem_type, component_sampler, imblearn_samp def test_smotenc_categorical_features(X_y_binary): X, y = X_y_binary - X_ww = infer_feature_types(X, feature_types={0: 'Categorical', 1: 'Categorical'}) + X_ww = infer_feature_types(X, feature_types={0: "Categorical", 1: "Categorical"}) snc = SMOTENCSampler() X_out, y_out = snc.fit_transform(X_ww, y) assert snc.categorical_features == [0, 1] @@ -208,9 +253,11 @@ def test_smotenc_categorical_features(X_y_binary): def test_smotenc_output_shape(X_y_binary): X, y = X_y_binary y_imbalanced = pd.Series([0] * 90 + [1] * 10) - X_ww = infer_feature_types(X, feature_types={0: 'Categorical', 1: 'Categorical'}) + X_ww = infer_feature_types(X, feature_types={0: "Categorical", 1: "Categorical"}) snc = SMOTENCSampler() - with pytest.raises(ComponentNotYetFittedError, match=f'You must fit SMOTENCSampler'): + with pytest.raises( + ComponentNotYetFittedError, match=f"You must fit SMOTENCSampler" + ): snc.transform(X_ww, y) # test sampling and no sampling for y_value in [y, y_imbalanced]: @@ -222,16 +269,27 @@ def test_smotenc_output_shape(X_y_binary): assert X_out.shape[1] == X_ww.shape[1] -@pytest.mark.parametrize("sampling_ratio_dict,expected_dict_values", [({0: 0.5, 1: 1}, {0: 425, 1: 850}), - ({0: 0.1, 1: 1}, {0: 150, 1: 850}), - ({0: 1, 1: 1}, {0: 850, 1: 850}), - ({0: 0.5, 1: 0.1}, {0: 425, 1: 850})]) +@pytest.mark.parametrize( + "sampling_ratio_dict,expected_dict_values", + [ + ({0: 0.5, 1: 1}, {0: 425, 1: 850}), + ({0: 0.1, 1: 1}, {0: 150, 1: 850}), + ({0: 1, 1: 1}, {0: 850, 1: 850}), + ({0: 0.5, 1: 0.1}, {0: 425, 1: 850}), + ], +) @pytest.mark.parametrize("oversampler", [SMOTESampler, SMOTENCSampler, SMOTENSampler]) -def test_oversampler_sampling_dict(oversampler, sampling_ratio_dict, expected_dict_values): - X = np.array([[i for i in range(1000)], - [i % 7 for i in range(1000)], - [0.3 * (i % 3) for i in range(1000)]]).T - X_ww = infer_feature_types(X, feature_types={0: 'Categorical', 1: 'Categorical'}) +def test_oversampler_sampling_dict( + oversampler, sampling_ratio_dict, expected_dict_values +): + X = np.array( + [ + [i for i in range(1000)], + [i % 7 for i in range(1000)], + [0.3 * (i % 3) for i in range(1000)], + ] + ).T + X_ww = infer_feature_types(X, feature_types={0: "Categorical", 1: "Categorical"}) y = np.array([0] * 150 + [1] * 850) overs = oversampler(sampling_ratio_dict=sampling_ratio_dict, random_seed=12) new_X, new_y = overs.fit_transform(X_ww, y) @@ -243,10 +301,14 @@ def test_oversampler_sampling_dict(oversampler, sampling_ratio_dict, expected_di @pytest.mark.parametrize("oversampler", [SMOTESampler, SMOTENCSampler, SMOTENSampler]) def test_oversampler_dictionary_overrides_ratio(oversampler): - X = np.array([[i for i in range(1000)], - [i % 7 for i in range(1000)], - [0.3 * (i % 3) for i in range(1000)]]).T - X_ww = infer_feature_types(X, feature_types={0: 'Categorical', 1: 'Categorical'}) + X = np.array( + [ + [i for i in range(1000)], + [i % 7 for i in range(1000)], + [0.3 * (i % 3) for i in range(1000)], + ] + ).T + X_ww = infer_feature_types(X, feature_types={0: "Categorical", 1: "Categorical"}) y = np.array([0] * 150 + [1] * 850) dictionary = {0: 0.5, 1: 1} expected_result = {0: 425, 1: 850} @@ -259,10 +321,14 @@ def test_oversampler_dictionary_overrides_ratio(oversampler): @pytest.mark.parametrize("oversampler", 
[SMOTESampler, SMOTENCSampler, SMOTENSampler]) def test_oversampler_sampling_dict_strings(oversampler): - X = np.array([[i for i in range(1000)], - [i % 7 for i in range(1000)], - [0.3 * (i % 3) for i in range(1000)]]).T - X_ww = infer_feature_types(X, feature_types={0: 'Categorical', 1: 'Categorical'}) + X = np.array( + [ + [i for i in range(1000)], + [i % 7 for i in range(1000)], + [0.3 * (i % 3) for i in range(1000)], + ] + ).T + X_ww = infer_feature_types(X, feature_types={0: "Categorical", 1: "Categorical"}) y = np.array(["minority"] * 150 + ["majority"] * 850) dictionary = {"minority": 0.5, "majority": 1} expected_result = {"minority": 425, "majority": 850} diff --git a/evalml/tests/component_tests/test_pca.py b/evalml/tests/component_tests/test_pca.py index 4e1c24d6e6..70875fa258 100644 --- a/evalml/tests/component_tests/test_pca.py +++ b/evalml/tests/component_tests/test_pca.py @@ -8,67 +8,77 @@ from evalml.pipelines.components import PCA -@pytest.mark.parametrize('data_type', ['pd', 'ww']) +@pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_pca_numeric(data_type, make_data_type): - X = pd.DataFrame([[3, 0, 1, 6], - [1, 2, 1, 6], - [10, 2, 1, 6], - [10, 2, 2, 5], - [6, 2, 2, 5]]) + X = pd.DataFrame( + [[3, 0, 1, 6], [1, 2, 1, 6], [10, 2, 1, 6], [10, 2, 2, 5], [6, 2, 2, 5]] + ) X = make_data_type(data_type, X) pca = PCA() - expected_X_t = pd.DataFrame([[3.176246, 1.282616], - [4.969987, -0.702976], - [-3.954182, 0.429071], - [-4.079174, -0.252790], - [-0.112877, -0.755922]], - columns=[f"component_{i}" for i in range(2)]) + expected_X_t = pd.DataFrame( + [ + [3.176246, 1.282616], + [4.969987, -0.702976], + [-3.954182, 0.429071], + [-4.079174, -0.252790], + [-0.112877, -0.755922], + ], + columns=[f"component_{i}" for i in range(2)], + ) X_t = pca.fit_transform(X) assert_frame_equal(expected_X_t, X_t) def test_pca_array(): - X = np.array([[3, 0, 1, 6], - [1, 2, 1, 6], - [10, 2, 1, 6], - [10, 2, 2, 5], - [6, 2, 2, 5]]) + X = np.array( + [[3, 0, 1, 6], [1, 2, 1, 6], [10, 2, 1, 6], [10, 2, 2, 5], [6, 2, 2, 5]] + ) pca = PCA() - expected_X_t = pd.DataFrame([[3.176246, 1.282616], - [4.969987, -0.702976], - [-3.954182, 0.429071], - [-4.079174, -0.252790], - [-0.112877, -0.755922]], - columns=[f"component_{i}" for i in range(2)]) + expected_X_t = pd.DataFrame( + [ + [3.176246, 1.282616], + [4.969987, -0.702976], + [-3.954182, 0.429071], + [-4.079174, -0.252790], + [-0.112877, -0.755922], + ], + columns=[f"component_{i}" for i in range(2)], + ) pca.fit(X) X_t = pca.transform(X) assert_frame_equal(expected_X_t, X_t) def test_pca_invalid(): - X = pd.DataFrame([[3, 0, 1, 6], - [1, None, 1, 6], - [10, 2, 1, 6], - [10, 2, 2, np.nan], - [None, 2, 2, 5]]) + X = pd.DataFrame( + [ + [3, 0, 1, 6], + [1, None, 1, 6], + [10, 2, 1, 6], + [10, 2, 2, np.nan], + [None, 2, 2, 5], + ] + ) pca = PCA() with pytest.raises(ValueError, match="must be all numeric"): pca.fit(X) - X = pd.DataFrame([[3, 0, 1, 6], - ['a', 'b', 'a', 'b'], - [10, 2, 1, 6], - [10, 2, 2, 23], - [0, 2, 2, 5]]) + X = pd.DataFrame( + [ + [3, 0, 1, 6], + ["a", "b", "a", "b"], + [10, 2, 1, 6], + [10, 2, 2, 23], + [0, 2, 2, 5], + ] + ) pca = PCA() with pytest.raises(ValueError, match="must be all numeric"): pca.fit_transform(X) - X_ok = pd.DataFrame([[3, 0, 1, 6], - [1, 2, 1, 6], - [10, 2, 1, 6], - [10, 2, 2, 5], - [6, 2, 2, 5]]) + X_ok = pd.DataFrame( + [[3, 0, 1, 6], [1, 2, 1, 6], [10, 2, 1, 6], [10, 2, 2, 5], [6, 2, 2, 5]] + ) pca = PCA() pca.fit(X_ok) with pytest.raises(ValueError, match="must be all numeric"): @@ -76,18 +86,26 @@ 
def test_pca_invalid(): def test_variance(): - X = pd.DataFrame([[3, 0, 1, 6, 5, 10], - [1, 2, 1, 3, 11, 4], - [10, 2, 1, 12, 5, 6], - [10, 6, 4, 4, 0, 1], - [6, 8, 9, 3, 1, 5]]) + X = pd.DataFrame( + [ + [3, 0, 1, 6, 5, 10], + [1, 2, 1, 3, 11, 4], + [10, 2, 1, 12, 5, 6], + [10, 6, 4, 4, 0, 1], + [6, 8, 9, 3, 1, 5], + ] + ) pca = PCA(variance=0.97) - expected_X_t = pd.DataFrame([[-5.581732, 0.469307, 3.985657, 1.760273], - [-6.961064, -5.026062, -3.170519, -0.624576], - [-1.352624, 7.778657, -0.778879, -1.554429], - [7.067179, 0.645894, -2.633617, 2.159135], - [6.828241, -3.867796, 2.597358, -1.740404]], - columns=[f"component_{i}" for i in range(4)]) + expected_X_t = pd.DataFrame( + [ + [-5.581732, 0.469307, 3.985657, 1.760273], + [-6.961064, -5.026062, -3.170519, -0.624576], + [-1.352624, 7.778657, -0.778879, -1.554429], + [7.067179, 0.645894, -2.633617, 2.159135], + [6.828241, -3.867796, 2.597358, -1.740404], + ], + columns=[f"component_{i}" for i in range(4)], + ) X_t_90 = pca.fit_transform(X) assert_frame_equal(expected_X_t, X_t_90) @@ -101,11 +119,15 @@ def test_variance(): def test_n_components(): - X = pd.DataFrame([[3, 0, 1, 6, 5, 10], - [1, 2, 1, 3, 11, 4], - [10, 2, 1, 12, 5, 6], - [10, 6, 4, 4, 0, 1], - [6, 8, 9, 3, 1, 5]]) + X = pd.DataFrame( + [ + [3, 0, 1, 6, 5, 10], + [1, 2, 1, 3, 11, 4], + [10, 2, 1, 12, 5, 6], + [10, 6, 4, 4, 0, 1], + [6, 8, 9, 3, 1, 5], + ] + ) pca = PCA(n_components=5) X_t = pca.fit_transform(X) assert X_t.shape[1] == 5 @@ -119,9 +141,14 @@ def test_n_components(): assert X_t.shape[1] == 1 -@pytest.mark.parametrize("X_df", [pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), - pd.DataFrame(pd.Series([1., 2., 3.], dtype="float")), - pd.DataFrame(pd.Series([True, False, True], dtype="boolean"))]) +@pytest.mark.parametrize( + "X_df", + [ + pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), + pd.DataFrame(pd.Series([1.0, 2.0, 3.0], dtype="float")), + pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), + ], +) def test_pca_woodwork_custom_overrides_returned_by_components(X_df): y = pd.Series([1, 2, 1]) override_types = [Integer, Double] @@ -131,4 +158,4 @@ def test_pca_woodwork_custom_overrides_returned_by_components(X_df): pca.fit(X_df) transformed = pca.transform(X_df, y) assert isinstance(transformed, pd.DataFrame) - assert transformed.ww.logical_types == {'component_0': ww.logical_types.Double} + assert transformed.ww.logical_types == {"component_0": ww.logical_types.Double} diff --git a/evalml/tests/component_tests/test_per_column_imputer.py b/evalml/tests/component_tests/test_per_column_imputer.py index f25ed8b9b2..5fa9a75bd4 100644 --- a/evalml/tests/component_tests/test_per_column_imputer.py +++ b/evalml/tests/component_tests/test_per_column_imputer.py @@ -8,7 +8,7 @@ Categorical, Double, Integer, - NaturalLanguage + NaturalLanguage, ) from evalml.pipelines.components import PerColumnImputer @@ -16,40 +16,52 @@ @pytest.fixture def non_numeric_df(): - X = pd.DataFrame([["a", "a", "a", "a"], - ["b", "b", "b", "b"], - ["a", "a", "a", "a"], - [np.nan, np.nan, np.nan, np.nan]]) - X.columns = ['A', 'B', 'C', 'D'] + X = pd.DataFrame( + [ + ["a", "a", "a", "a"], + ["b", "b", "b", "b"], + ["a", "a", "a", "a"], + [np.nan, np.nan, np.nan, np.nan], + ] + ) + X.columns = ["A", "B", "C", "D"] return X def test_invalid_parameters(): with pytest.raises(ValueError): - strategies = ("impute_strategy", 'mean') + strategies = ("impute_strategy", "mean") PerColumnImputer(impute_strategies=strategies) with pytest.raises(ValueError): - strategies = ['mean'] + 
strategies = ["mean"] PerColumnImputer(impute_strategies=strategies) def test_all_strategies(): - X = pd.DataFrame({"A": pd.Series([2, 4, 6, np.nan]), - "B": pd.Series([4, 6, 4, np.nan]), - "C": pd.Series([6, 8, 8, np.nan]), - "D": pd.Series(["a", "a", "b", np.nan])}) - - X_expected = pd.DataFrame({"A": pd.Series([2, 4, 6, 4]), - "B": pd.Series([4, 6, 4, 4]), - "C": pd.Series([6, 8, 8, 100]), - "D": pd.Series(["a", "a", "b", "a"], dtype="category")}) + X = pd.DataFrame( + { + "A": pd.Series([2, 4, 6, np.nan]), + "B": pd.Series([4, 6, 4, np.nan]), + "C": pd.Series([6, 8, 8, np.nan]), + "D": pd.Series(["a", "a", "b", np.nan]), + } + ) + + X_expected = pd.DataFrame( + { + "A": pd.Series([2, 4, 6, 4]), + "B": pd.Series([4, 6, 4, 4]), + "C": pd.Series([6, 8, 8, 100]), + "D": pd.Series(["a", "a", "b", "a"], dtype="category"), + } + ) strategies = { - 'A': {"impute_strategy": "mean"}, - 'B': {"impute_strategy": "median"}, - 'C': {"impute_strategy": "constant", "fill_value": 100}, - 'D': {"impute_strategy": "most_frequent"}, + "A": {"impute_strategy": "mean"}, + "B": {"impute_strategy": "median"}, + "C": {"impute_strategy": "constant", "fill_value": 100}, + "D": {"impute_strategy": "most_frequent"}, } transformer = PerColumnImputer(impute_strategies=strategies) @@ -58,19 +70,13 @@ def test_all_strategies(): def test_fit_transform(): - X = pd.DataFrame([[2], - [4], - [6], - [np.nan]]) + X = pd.DataFrame([[2], [4], [6], [np.nan]]) - X_expected = pd.DataFrame([[2], - [4], - [6], - [4]]) + X_expected = pd.DataFrame([[2], [4], [6], [4]]) - X.columns = ['A'] - X_expected.columns = ['A'] - strategies = {'A': {"impute_strategy": "median"}} + X.columns = ["A"] + X_expected.columns = ["A"] + strategies = {"A": {"impute_strategy": "median"}} transformer = PerColumnImputer(impute_strategies=strategies) transformer.fit(X) @@ -87,20 +93,28 @@ def test_non_numeric_errors(non_numeric_df): X = non_numeric_df # mean with all strings - strategies = {'A': {"impute_strategy": "mean"}} - with pytest.raises(ValueError, match="Cannot use mean strategy with non-numeric data"): + strategies = {"A": {"impute_strategy": "mean"}} + with pytest.raises( + ValueError, match="Cannot use mean strategy with non-numeric data" + ): transformer = PerColumnImputer(impute_strategies=strategies) transformer.fit_transform(X) - with pytest.raises(ValueError, match="Cannot use mean strategy with non-numeric data"): + with pytest.raises( + ValueError, match="Cannot use mean strategy with non-numeric data" + ): transformer = PerColumnImputer(impute_strategies=strategies) transformer.fit(X) # median with all strings - strategies = {'B': {"impute_strategy": "median"}} - with pytest.raises(ValueError, match="Cannot use median strategy with non-numeric data"): + strategies = {"B": {"impute_strategy": "median"}} + with pytest.raises( + ValueError, match="Cannot use median strategy with non-numeric data" + ): transformer = PerColumnImputer(impute_strategies=strategies) transformer.fit_transform(X) - with pytest.raises(ValueError, match="Cannot use median strategy with non-numeric data"): + with pytest.raises( + ValueError, match="Cannot use median strategy with non-numeric data" + ): transformer = PerColumnImputer(impute_strategies=strategies) transformer.fit(X) @@ -109,89 +123,141 @@ def test_non_numeric_valid(non_numeric_df): X = non_numeric_df # most frequent with all strings - strategies = {'C': {"impute_strategy": "most_frequent"}} + strategies = {"C": {"impute_strategy": "most_frequent"}} transformer = 
PerColumnImputer(impute_strategies=strategies) - X_expected = pd.DataFrame({"A": pd.Series(["a", "b", "a", "a"], dtype="category"), - "B": pd.Series(["a", "b", "a", "a"], dtype="category"), - "C": pd.Series(["a", "b", "a", "a"], dtype="category"), - "D": pd.Series(["a", "b", "a", "a"], dtype="category")}) + X_expected = pd.DataFrame( + { + "A": pd.Series(["a", "b", "a", "a"], dtype="category"), + "B": pd.Series(["a", "b", "a", "a"], dtype="category"), + "C": pd.Series(["a", "b", "a", "a"], dtype="category"), + "D": pd.Series(["a", "b", "a", "a"], dtype="category"), + } + ) X_t = transformer.fit_transform(X) assert_frame_equal(X_expected, X_t) # constant with all strings - strategies = {'D': {"impute_strategy": "constant", "fill_value": 100}} + strategies = {"D": {"impute_strategy": "constant", "fill_value": 100}} transformer = PerColumnImputer(impute_strategies=strategies) - X_expected = pd.DataFrame([["a", "a", "a", "a"], - ["b", "b", "b", "b"], - ["a", "a", "a", "a"], - ["a", "a", "a", 100]]) - X_expected.columns = ['A', 'B', 'C', 'D'] - X_expected = pd.DataFrame({"A": pd.Series(["a", "b", "a", "a"], dtype="category"), - "B": pd.Series(["a", "b", "a", "a"], dtype="category"), - "C": pd.Series(["a", "b", "a", "a"], dtype="category"), - "D": pd.Series(["a", "b", "a", 100], dtype="category")}) + X_expected = pd.DataFrame( + [ + ["a", "a", "a", "a"], + ["b", "b", "b", "b"], + ["a", "a", "a", "a"], + ["a", "a", "a", 100], + ] + ) + X_expected.columns = ["A", "B", "C", "D"] + X_expected = pd.DataFrame( + { + "A": pd.Series(["a", "b", "a", "a"], dtype="category"), + "B": pd.Series(["a", "b", "a", "a"], dtype="category"), + "C": pd.Series(["a", "b", "a", "a"], dtype="category"), + "D": pd.Series(["a", "b", "a", 100], dtype="category"), + } + ) X_t = transformer.fit_transform(X) assert_frame_equal(X_expected, X_t) def test_fit_transform_drop_all_nan_columns(): - X = pd.DataFrame({"all_nan": [np.nan, np.nan, np.nan], - "some_nan": [np.nan, 1, 0], - "another_col": [0, 1, 2]}) - strategies = {'all_nan': {"impute_strategy": "most_frequent"}, - 'some_nan': {"impute_strategy": "most_frequent"}, - 'another_col': {"impute_strategy": "most_frequent"}} + X = pd.DataFrame( + { + "all_nan": [np.nan, np.nan, np.nan], + "some_nan": [np.nan, 1, 0], + "another_col": [0, 1, 2], + } + ) + strategies = { + "all_nan": {"impute_strategy": "most_frequent"}, + "some_nan": {"impute_strategy": "most_frequent"}, + "another_col": {"impute_strategy": "most_frequent"}, + } transformer = PerColumnImputer(impute_strategies=strategies) X_expected_arr = pd.DataFrame({"some_nan": [0, 1, 0], "another_col": [0, 1, 2]}) X_t = transformer.fit_transform(X) assert_frame_equal(X_expected_arr, X_t, check_dtype=False) - assert_frame_equal(X, pd.DataFrame({"all_nan": [np.nan, np.nan, np.nan], - "some_nan": [np.nan, 1, 0], - "another_col": [0, 1, 2]})) + assert_frame_equal( + X, + pd.DataFrame( + { + "all_nan": [np.nan, np.nan, np.nan], + "some_nan": [np.nan, 1, 0], + "another_col": [0, 1, 2], + } + ), + ) def test_transform_drop_all_nan_columns(): - X = pd.DataFrame({"all_nan": [np.nan, np.nan, np.nan], - "some_nan": [np.nan, 1, 0], - "another_col": [0, 1, 2]}) - strategies = {'all_nan': {"impute_strategy": "most_frequent"}, - 'some_nan': {"impute_strategy": "most_frequent"}, - 'another_col': {"impute_strategy": "most_frequent"}} + X = pd.DataFrame( + { + "all_nan": [np.nan, np.nan, np.nan], + "some_nan": [np.nan, 1, 0], + "another_col": [0, 1, 2], + } + ) + strategies = { + "all_nan": {"impute_strategy": "most_frequent"}, + "some_nan": 
{"impute_strategy": "most_frequent"}, + "another_col": {"impute_strategy": "most_frequent"}, + } transformer = PerColumnImputer(impute_strategies=strategies) transformer.fit(X) X_expected_arr = pd.DataFrame({"some_nan": [0, 1, 0], "another_col": [0, 1, 2]}) X_t = transformer.transform(X) assert_frame_equal(X_expected_arr, X_t, check_dtype=False) - assert_frame_equal(X, pd.DataFrame({"all_nan": [np.nan, np.nan, np.nan], - "some_nan": [np.nan, 1, 0], - "another_col": [0, 1, 2]})) + assert_frame_equal( + X, + pd.DataFrame( + { + "all_nan": [np.nan, np.nan, np.nan], + "some_nan": [np.nan, 1, 0], + "another_col": [0, 1, 2], + } + ), + ) def test_transform_drop_all_nan_columns_empty(): X = pd.DataFrame([[np.nan, np.nan, np.nan]]) - strategies = {'0': {"impute_strategy": "most_frequent"}, } + strategies = { + "0": {"impute_strategy": "most_frequent"}, + } transformer = PerColumnImputer(impute_strategies=strategies) assert transformer.fit_transform(X).empty assert_frame_equal(X, pd.DataFrame([[np.nan, np.nan, np.nan]])) - strategies = {'0': {"impute_strategy": "most_frequent"}} + strategies = {"0": {"impute_strategy": "most_frequent"}} transformer = PerColumnImputer(impute_strategies=strategies) transformer.fit(X) assert transformer.transform(X).empty assert_frame_equal(X, pd.DataFrame([[np.nan, np.nan, np.nan]])) -@pytest.mark.parametrize("X_df", [pd.DataFrame(pd.Series([1, 2, 3], dtype="int64")), - pd.DataFrame(pd.Series([1., 2., 3.], dtype="float")), - pd.DataFrame(pd.Series(['a', 'b', 'a'], dtype="category")), - pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), - pd.DataFrame(pd.Series(['this will be a natural language column because length', 'yay', 'hay'], dtype="string"))]) +@pytest.mark.parametrize( + "X_df", + [ + pd.DataFrame(pd.Series([1, 2, 3], dtype="int64")), + pd.DataFrame(pd.Series([1.0, 2.0, 3.0], dtype="float")), + pd.DataFrame(pd.Series(["a", "b", "a"], dtype="category")), + pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), + pd.DataFrame( + pd.Series( + ["this will be a natural language column because length", "yay", "hay"], + dtype="string", + ) + ), + ], +) @pytest.mark.parametrize("has_nan", [True, False]) -def test_per_column_imputer_woodwork_custom_overrides_returned_by_components(X_df, has_nan): +def test_per_column_imputer_woodwork_custom_overrides_returned_by_components( + X_df, has_nan +): y = pd.Series([1, 2, 1]) override_types = [Integer, Double, Categorical, NaturalLanguage, Boolean] for logical_type in override_types: diff --git a/evalml/tests/component_tests/test_polynomial_detrender.py b/evalml/tests/component_tests/test_polynomial_detrender.py index af133adfa1..6948c507c7 100644 --- a/evalml/tests/component_tests/test_polynomial_detrender.py +++ b/evalml/tests/component_tests/test_polynomial_detrender.py @@ -7,7 +7,9 @@ from evalml.pipelines.components import PolynomialDetrender -pytest.importorskip('sktime', reason='Skipping polynomial detrending tests because sktime not installed') +pytest.importorskip( + "sktime", reason="Skipping polynomial detrending tests because sktime not installed" +) def test_polynomial_detrender_init(): @@ -41,7 +43,7 @@ def test_polynomial_detrender_raises_value_error_target_is_none(ts_data): pdt.inverse_transform(X, None) -@pytest.mark.parametrize("input_type", ['np', 'pd', 'ww']) +@pytest.mark.parametrize("input_type", ["np", "pd", "ww"]) @pytest.mark.parametrize("use_int_index", [True, False]) @pytest.mark.parametrize("degree", [1, 2, 3]) def test_polynomial_detrender_fit_transform(degree, use_int_index, 
input_type, ts_data): @@ -54,18 +56,20 @@ def test_polynomial_detrender_fit_transform(degree, use_int_index, input_type, t # Get the expected answer lin_reg = LinearRegression(fit_intercept=True) - features = PolynomialFeatures(degree=degree).fit_transform(np.arange(X_input.shape[0]).reshape(-1, 1)) + features = PolynomialFeatures(degree=degree).fit_transform( + np.arange(X_input.shape[0]).reshape(-1, 1) + ) lin_reg.fit(features, y_input) detrended_values = y_input.values - lin_reg.predict(features) - expected_index = y_input.index if input_type != 'np' else range(y_input.shape[0]) + expected_index = y_input.index if input_type != "np" else range(y_input.shape[0]) expected_answer = pd.Series(detrended_values, index=expected_index) X, y = X_input, y_input - if input_type == 'np': + if input_type == "np": X = X_input.values y = y_input.values - elif input_type == 'ww': + elif input_type == "ww": X = X_input.copy() X.ww.init() y = ww.init_series(y_input.copy()) @@ -99,10 +103,16 @@ def test_polynomial_detrender_needs_monotonic_index(ts_data): X, y = ts_data detrender = PolynomialDetrender(degree=2) - with pytest.raises(ValueError, match="The \\(time\\) index must be sorted \\(monotonically increasing\\)"): + with pytest.raises( + ValueError, + match="The \\(time\\) index must be sorted \\(monotonically increasing\\)", + ): y_shuffled = y.sample(frac=1, replace=False) detrender.fit_transform(X, y_shuffled) - with pytest.raises(NotImplementedError, match="class 'pandas.core.indexes.base.Index'> is not supported"): + with pytest.raises( + NotImplementedError, + match="class 'pandas.core.indexes.base.Index'> is not supported", + ): y_string_index = pd.Series(np.arange(31), index=[f"row_{i}" for i in range(31)]) detrender.fit_transform(X, y_string_index) diff --git a/evalml/tests/component_tests/test_simple_imputer.py b/evalml/tests/component_tests/test_simple_imputer.py index 789d8581d7..e8897d0650 100644 --- a/evalml/tests/component_tests/test_simple_imputer.py +++ b/evalml/tests/component_tests/test_simple_imputer.py @@ -8,109 +8,104 @@ Categorical, Double, Integer, - NaturalLanguage + NaturalLanguage, ) from evalml.pipelines.components import SimpleImputer def test_simple_imputer_median(): - X = pd.DataFrame([[np.nan, 0, 1, np.nan], - [1, 2, 3, 2], - [10, 2, np.nan, 2], - [10, 2, 5, np.nan], - [6, 2, 7, 0]]) - transformer = SimpleImputer(impute_strategy='median') - X_expected_arr = pd.DataFrame([[8, 0, 1, 2], - [1, 2, 3, 2], - [10, 2, 4, 2], - [10, 2, 5, 2], - [6, 2, 7, 0]]) + X = pd.DataFrame( + [ + [np.nan, 0, 1, np.nan], + [1, 2, 3, 2], + [10, 2, np.nan, 2], + [10, 2, 5, np.nan], + [6, 2, 7, 0], + ] + ) + transformer = SimpleImputer(impute_strategy="median") + X_expected_arr = pd.DataFrame( + [[8, 0, 1, 2], [1, 2, 3, 2], [10, 2, 4, 2], [10, 2, 5, 2], [6, 2, 7, 0]] + ) X_t = transformer.fit_transform(X) assert_frame_equal(X_expected_arr, X_t, check_dtype=False) def test_simple_imputer_mean(): - X = pd.DataFrame([[np.nan, 0, 1, np.nan], - [1, 2, 3, 2], - [1, 2, 3, 0]]) + X = pd.DataFrame([[np.nan, 0, 1, np.nan], [1, 2, 3, 2], [1, 2, 3, 0]]) # test impute_strategy - transformer = SimpleImputer(impute_strategy='mean') - X_expected_arr = pd.DataFrame([[1, 0, 1, 1], - [1, 2, 3, 2], - [1, 2, 3, 0]]) + transformer = SimpleImputer(impute_strategy="mean") + X_expected_arr = pd.DataFrame([[1, 0, 1, 1], [1, 2, 3, 2], [1, 2, 3, 0]]) X_t = transformer.fit_transform(X) assert_frame_equal(X_expected_arr, X_t, check_dtype=False) def test_simple_imputer_constant(): # test impute strategy is constant and fill 
value is not specified - X = pd.DataFrame([[np.nan, 0, 1, np.nan], - ["a", 2, np.nan, 3], - ["b", 2, 3, 0]]) - - transformer = SimpleImputer(impute_strategy='constant', fill_value=3) - X_expected_arr = pd.DataFrame([[3, 0, 1, 3], - ["a", 2, 3, 3], - ["b", 2, 3, 0]]) - X_expected_arr = X_expected_arr.astype({0: 'category'}) + X = pd.DataFrame([[np.nan, 0, 1, np.nan], ["a", 2, np.nan, 3], ["b", 2, 3, 0]]) + + transformer = SimpleImputer(impute_strategy="constant", fill_value=3) + X_expected_arr = pd.DataFrame([[3, 0, 1, 3], ["a", 2, 3, 3], ["b", 2, 3, 0]]) + X_expected_arr = X_expected_arr.astype({0: "category"}) X_t = transformer.fit_transform(X) assert_frame_equal(X_expected_arr, X_t, check_dtype=False) def test_simple_imputer_most_frequent(): - X = pd.DataFrame([[np.nan, 0, 1, np.nan], - ["a", 2, np.nan, 3], - ["b", 2, 1, 0]]) - - transformer = SimpleImputer(impute_strategy='most_frequent') - X_expected_arr = pd.DataFrame([["a", 0, 1, 0], - ["a", 2, 1, 3], - ["b", 2, 1, 0]]) - X_expected_arr = X_expected_arr.astype({0: 'category'}) + X = pd.DataFrame([[np.nan, 0, 1, np.nan], ["a", 2, np.nan, 3], ["b", 2, 1, 0]]) + + transformer = SimpleImputer(impute_strategy="most_frequent") + X_expected_arr = pd.DataFrame([["a", 0, 1, 0], ["a", 2, 1, 3], ["b", 2, 1, 0]]) + X_expected_arr = X_expected_arr.astype({0: "category"}) X_t = transformer.fit_transform(X) assert_frame_equal(X_expected_arr, X_t, check_dtype=False) def test_simple_imputer_col_with_non_numeric(): # test col with all strings - X = pd.DataFrame([["a", 0, 1, np.nan], - ["b", 2, 3, 3], - ["a", 2, 3, 1], - [np.nan, 2, 3, 0]]) - - transformer = SimpleImputer(impute_strategy='mean') - with pytest.raises(ValueError, match="Cannot use mean strategy with non-numeric data"): + X = pd.DataFrame( + [["a", 0, 1, np.nan], ["b", 2, 3, 3], ["a", 2, 3, 1], [np.nan, 2, 3, 0]] + ) + + transformer = SimpleImputer(impute_strategy="mean") + with pytest.raises( + ValueError, match="Cannot use mean strategy with non-numeric data" + ): transformer.fit_transform(X) - with pytest.raises(ValueError, match="Cannot use mean strategy with non-numeric data"): + with pytest.raises( + ValueError, match="Cannot use mean strategy with non-numeric data" + ): transformer.fit(X) - transformer = SimpleImputer(impute_strategy='median') - with pytest.raises(ValueError, match="Cannot use median strategy with non-numeric data"): + transformer = SimpleImputer(impute_strategy="median") + with pytest.raises( + ValueError, match="Cannot use median strategy with non-numeric data" + ): transformer.fit_transform(X) - with pytest.raises(ValueError, match="Cannot use median strategy with non-numeric data"): + with pytest.raises( + ValueError, match="Cannot use median strategy with non-numeric data" + ): transformer.fit(X) - transformer = SimpleImputer(impute_strategy='most_frequent') - X_expected_arr = pd.DataFrame([["a", 0, 1, 0], - ["b", 2, 3, 3], - ["a", 2, 3, 1], - ["a", 2, 3, 0]]) - X_expected_arr = X_expected_arr.astype({0: 'category'}) + transformer = SimpleImputer(impute_strategy="most_frequent") + X_expected_arr = pd.DataFrame( + [["a", 0, 1, 0], ["b", 2, 3, 3], ["a", 2, 3, 1], ["a", 2, 3, 0]] + ) + X_expected_arr = X_expected_arr.astype({0: "category"}) X_t = transformer.fit_transform(X) assert_frame_equal(X_expected_arr, X_t, check_dtype=False) - transformer = SimpleImputer(impute_strategy='constant', fill_value=2) - X_expected_arr = pd.DataFrame([["a", 0, 1, 2], - ["b", 2, 3, 3], - ["a", 2, 3, 1], - [2, 2, 3, 0]]) - X_expected_arr = X_expected_arr.astype({0: 'category'}) + 
transformer = SimpleImputer(impute_strategy="constant", fill_value=2) + X_expected_arr = pd.DataFrame( + [["a", 0, 1, 2], ["b", 2, 3, 3], ["a", 2, 3, 1], [2, 2, 3, 0]] + ) + X_expected_arr = X_expected_arr.astype({0: "category"}) X_t = transformer.fit_transform(X) assert_frame_equal(X_expected_arr, X_t, check_dtype=False) -@pytest.mark.parametrize("data_type", ['pd', 'ww']) +@pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_simple_imputer_all_bool_return_original(data_type, make_data_type): X = pd.DataFrame([True, True, False, True, True], dtype=bool) y = pd.Series([1, 0, 0, 1, 0]) @@ -123,11 +118,11 @@ def test_simple_imputer_all_bool_return_original(data_type, make_data_type): assert_frame_equal(X_expected_arr, X_t) -@pytest.mark.parametrize("data_type", ['pd', 'ww']) +@pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_simple_imputer_boolean_dtype(data_type, make_data_type): X = pd.DataFrame([True, np.nan, False, np.nan, True]) y = pd.Series([1, 0, 0, 1, 0]) - X_expected_arr = pd.DataFrame([True, True, False, True, True], dtype='category') + X_expected_arr = pd.DataFrame([True, True, False, True, True], dtype="category") X = make_data_type(data_type, X) imputer = SimpleImputer() imputer.fit(X, y) @@ -135,17 +130,23 @@ def test_simple_imputer_boolean_dtype(data_type, make_data_type): assert_frame_equal(X_expected_arr, X_t) -@pytest.mark.parametrize("data_type", ['pd', 'ww']) +@pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_simple_imputer_multitype_with_one_bool(data_type, make_data_type): - X_multi = pd.DataFrame({ - "bool with nan": pd.Series([True, np.nan, False, np.nan, False]), - "bool no nan": pd.Series([False, False, False, False, True], dtype=bool), - }) + X_multi = pd.DataFrame( + { + "bool with nan": pd.Series([True, np.nan, False, np.nan, False]), + "bool no nan": pd.Series([False, False, False, False, True], dtype=bool), + } + ) y = pd.Series([1, 0, 0, 1, 0]) - X_multi_expected_arr = pd.DataFrame({ - "bool with nan": pd.Series([True, False, False, False, False], dtype='category'), - "bool no nan": pd.Series([False, False, False, False, True], dtype=bool), - }) + X_multi_expected_arr = pd.DataFrame( + { + "bool with nan": pd.Series( + [True, False, False, False, False], dtype="category" + ), + "bool no nan": pd.Series([False, False, False, False, True], dtype=bool), + } + ) X_multi = make_data_type(data_type, X_multi) imputer = SimpleImputer() @@ -155,80 +156,106 @@ def test_simple_imputer_multitype_with_one_bool(data_type, make_data_type): def test_simple_imputer_fit_transform_drop_all_nan_columns(): - X = pd.DataFrame({"all_nan": [np.nan, np.nan, np.nan], - "some_nan": [np.nan, 1, 0], - "another_col": [0, 1, 2]}) - - transformer = SimpleImputer(impute_strategy='most_frequent') + X = pd.DataFrame( + { + "all_nan": [np.nan, np.nan, np.nan], + "some_nan": [np.nan, 1, 0], + "another_col": [0, 1, 2], + } + ) + + transformer = SimpleImputer(impute_strategy="most_frequent") X_expected_arr = pd.DataFrame({"some_nan": [0, 1, 0], "another_col": [0, 1, 2]}) X_t = transformer.fit_transform(X) assert_frame_equal(X_expected_arr, X_t, check_dtype=False) - assert_frame_equal(X, pd.DataFrame({"all_nan": [np.nan, np.nan, np.nan], - "some_nan": [np.nan, 1, 0], - "another_col": [0, 1, 2]})) + assert_frame_equal( + X, + pd.DataFrame( + { + "all_nan": [np.nan, np.nan, np.nan], + "some_nan": [np.nan, 1, 0], + "another_col": [0, 1, 2], + } + ), + ) def test_simple_imputer_transform_drop_all_nan_columns(): - X = pd.DataFrame({"all_nan": [np.nan, np.nan, np.nan], - 
"some_nan": [np.nan, 1, 0], - "another_col": [0, 1, 2]}) - transformer = SimpleImputer(impute_strategy='most_frequent') + X = pd.DataFrame( + { + "all_nan": [np.nan, np.nan, np.nan], + "some_nan": [np.nan, 1, 0], + "another_col": [0, 1, 2], + } + ) + transformer = SimpleImputer(impute_strategy="most_frequent") transformer.fit(X) X_expected_arr = pd.DataFrame({"some_nan": [0, 1, 0], "another_col": [0, 1, 2]}) assert_frame_equal(X_expected_arr, transformer.transform(X), check_dtype=False) - assert_frame_equal(X, pd.DataFrame({"all_nan": [np.nan, np.nan, np.nan], - "some_nan": [np.nan, 1, 0], - "another_col": [0, 1, 2]})) + assert_frame_equal( + X, + pd.DataFrame( + { + "all_nan": [np.nan, np.nan, np.nan], + "some_nan": [np.nan, 1, 0], + "another_col": [0, 1, 2], + } + ), + ) def test_simple_imputer_transform_drop_all_nan_columns_empty(): X = pd.DataFrame([[np.nan, np.nan, np.nan]]) - transformer = SimpleImputer(impute_strategy='most_frequent') + transformer = SimpleImputer(impute_strategy="most_frequent") assert transformer.fit_transform(X).empty assert_frame_equal(X, pd.DataFrame([[np.nan, np.nan, np.nan]])) - transformer = SimpleImputer(impute_strategy='most_frequent') + transformer = SimpleImputer(impute_strategy="most_frequent") transformer.fit(X) assert transformer.transform(X).empty assert_frame_equal(X, pd.DataFrame([[np.nan, np.nan, np.nan]])) def test_simple_imputer_numpy_input(): - X = np.array([[np.nan, 0, 1, np.nan], - [np.nan, 2, 3, 2], - [np.nan, 2, 3, 0]]) - transformer = SimpleImputer(impute_strategy='mean') - X_expected_arr = np.array([[0, 1, 1], - [2, 3, 2], - [2, 3, 0]]) + X = np.array([[np.nan, 0, 1, np.nan], [np.nan, 2, 3, 2], [np.nan, 2, 3, 0]]) + transformer = SimpleImputer(impute_strategy="mean") + X_expected_arr = np.array([[0, 1, 1], [2, 3, 2], [2, 3, 0]]) assert np.allclose(X_expected_arr, transformer.fit_transform(X)) - np.testing.assert_almost_equal(X, np.array([[np.nan, 0, 1, np.nan], - [np.nan, 2, 3, 2], - [np.nan, 2, 3, 0]])) + np.testing.assert_almost_equal( + X, np.array([[np.nan, 0, 1, np.nan], [np.nan, 2, 3, 2], [np.nan, 2, 3, 0]]) + ) @pytest.mark.parametrize("data_type", ["numeric", "categorical"]) def test_simple_imputer_fill_value(data_type): if data_type == "numeric": - X = pd.DataFrame({ - "some numeric": [np.nan, 1, 0], - "another numeric": [0, np.nan, 2] - }) + X = pd.DataFrame( + {"some numeric": [np.nan, 1, 0], "another numeric": [0, np.nan, 2]} + ) fill_value = -1 - expected = pd.DataFrame({ - "some numeric": [-1, 1, 0], - "another numeric": [0, -1, 2] - }) + expected = pd.DataFrame( + {"some numeric": [-1, 1, 0], "another numeric": [0, -1, 2]} + ) else: - X = pd.DataFrame({ - "categorical with nan": pd.Series([np.nan, "1", np.nan, "0", "3"], dtype='category'), - "object with nan": ["b", "b", np.nan, "c", np.nan] - }) + X = pd.DataFrame( + { + "categorical with nan": pd.Series( + [np.nan, "1", np.nan, "0", "3"], dtype="category" + ), + "object with nan": ["b", "b", np.nan, "c", np.nan], + } + ) fill_value = "fill" - expected = pd.DataFrame({ - "categorical with nan": pd.Series(["fill", "1", "fill", "0", "3"], dtype='category'), - "object with nan": pd.Series(["b", "b", "fill", "c", "fill"], dtype='category'), - }) + expected = pd.DataFrame( + { + "categorical with nan": pd.Series( + ["fill", "1", "fill", "0", "3"], dtype="category" + ), + "object with nan": pd.Series( + ["b", "b", "fill", "c", "fill"], dtype="category" + ), + } + ) y = pd.Series([0, 0, 1, 0, 1]) imputer = SimpleImputer(impute_strategy="constant", fill_value=fill_value) 
imputer.fit(X, y) @@ -241,71 +268,118 @@ def test_simple_imputer_fill_value(data_type): def test_simple_imputer_does_not_reset_index(): - X = pd.DataFrame({'input_val': np.arange(10), 'target': np.arange(10)}) - X.loc[5, 'input_val'] = np.nan + X = pd.DataFrame({"input_val": np.arange(10), "target": np.arange(10)}) + X.loc[5, "input_val"] = np.nan assert X.index.tolist() == list(range(10)) X.drop(0, inplace=True) - y = X.pop('target') - pd.testing.assert_frame_equal(pd.DataFrame({'input_val': [1.0, 2, 3, 4, np.nan, 6, 7, 8, 9]}, - dtype=float, index=list(range(1, 10))), X) + y = X.pop("target") + pd.testing.assert_frame_equal( + pd.DataFrame( + {"input_val": [1.0, 2, 3, 4, np.nan, 6, 7, 8, 9]}, + dtype=float, + index=list(range(1, 10)), + ), + X, + ) imputer = SimpleImputer(impute_strategy="mean") imputer.fit(X, y=y) transformed = imputer.transform(X) - pd.testing.assert_frame_equal(pd.DataFrame({'input_val': [1, 2, 3, 4, 5, 6, 7, 8, 9]}, - dtype=float, - index=list(range(1, 10))), - transformed) + pd.testing.assert_frame_equal( + pd.DataFrame( + {"input_val": [1, 2, 3, 4, 5, 6, 7, 8, 9]}, + dtype=float, + index=list(range(1, 10)), + ), + transformed, + ) def test_simple_imputer_with_none(): - X = pd.DataFrame({"int with None": [1, 0, 5, None], - "float with None": [0.1, 0.0, 0.5, None], - "all None": [None, None, None, None]}) + X = pd.DataFrame( + { + "int with None": [1, 0, 5, None], + "float with None": [0.1, 0.0, 0.5, None], + "all None": [None, None, None, None], + } + ) y = pd.Series([0, 0, 1, 0, 1]) imputer = SimpleImputer(impute_strategy="mean") imputer.fit(X, y) transformed = imputer.transform(X, y) - expected = pd.DataFrame({"int with None": [1, 0, 5, 2], - "float with None": [0.1, 0.0, 0.5, 0.2]}) + expected = pd.DataFrame( + {"int with None": [1, 0, 5, 2], "float with None": [0.1, 0.0, 0.5, 0.2]} + ) assert_frame_equal(expected, transformed, check_dtype=False) - X = pd.DataFrame({"category with None": pd.Series(["b", "a", "a", None], dtype='category'), - "boolean with None": pd.Series([True, None, False, True]), - "object with None": ["b", "a", "a", None], - "all None": [None, None, None, None]}) + X = pd.DataFrame( + { + "category with None": pd.Series(["b", "a", "a", None], dtype="category"), + "boolean with None": pd.Series([True, None, False, True]), + "object with None": ["b", "a", "a", None], + "all None": [None, None, None, None], + } + ) y = pd.Series([0, 0, 1, 0, 1]) imputer = SimpleImputer() imputer.fit(X, y) transformed = imputer.transform(X, y) - expected = pd.DataFrame({"category with None": pd.Series(["b", "a", "a", "a"], dtype='category'), - "boolean with None": pd.Series([True, True, False, True], dtype='category'), - "object with None": pd.Series(["b", "a", "a", "a"], dtype='category')}) + expected = pd.DataFrame( + { + "category with None": pd.Series(["b", "a", "a", "a"], dtype="category"), + "boolean with None": pd.Series([True, True, False, True], dtype="category"), + "object with None": pd.Series(["b", "a", "a", "a"], dtype="category"), + } + ) assert_frame_equal(expected, transformed, check_dtype=False) def test_simple_imputer_supports_natural_language_constant(): - X = pd.DataFrame({"cat with None": ["a", "b", "a", None], - "natural language col": ["free-form text", "will", "be imputed", None]}) + X = pd.DataFrame( + { + "cat with None": ["a", "b", "a", None], + "natural language col": ["free-form text", "will", "be imputed", None], + } + ) y = pd.Series([0, 0, 1, 0, 1]) X.ww.init(logical_types={"natural language col": "NaturalLanguage"}) imputer = 
SimpleImputer(impute_strategy="constant", fill_value="placeholder") imputer.fit(X, y) transformed = imputer.transform(X, y) - expected = pd.DataFrame({"cat with None": pd.Series(["a", "b", "a", "placeholder"], dtype='category'), - "natural language col": pd.Series(["free-form text", "will", "be imputed", "placeholder"], dtype='string')}) + expected = pd.DataFrame( + { + "cat with None": pd.Series( + ["a", "b", "a", "placeholder"], dtype="category" + ), + "natural language col": pd.Series( + ["free-form text", "will", "be imputed", "placeholder"], dtype="string" + ), + } + ) assert_frame_equal(expected, transformed, check_dtype=False) -@pytest.mark.parametrize("X_df", [pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), - pd.DataFrame(pd.Series([1., 2., 3.], dtype="float")), - pd.DataFrame(pd.Series(['a', 'b', 'a'], dtype="category")), - pd.DataFrame(pd.Series([True, False, True], dtype=bool)), - pd.DataFrame(pd.Series(['this will be a natural language column because length', 'yay', 'hay'], dtype="string"))]) +@pytest.mark.parametrize( + "X_df", + [ + pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), + pd.DataFrame(pd.Series([1.0, 2.0, 3.0], dtype="float")), + pd.DataFrame(pd.Series(["a", "b", "a"], dtype="category")), + pd.DataFrame(pd.Series([True, False, True], dtype=bool)), + pd.DataFrame( + pd.Series( + ["this will be a natural language column because length", "yay", "hay"], + dtype="string", + ) + ), + ], +) @pytest.mark.parametrize("has_nan", [True, False]) @pytest.mark.parametrize("impute_strategy", ["mean", "median"]) -def test_simple_imputer_woodwork_custom_overrides_returned_by_components(X_df, has_nan, impute_strategy): +def test_simple_imputer_woodwork_custom_overrides_returned_by_components( + X_df, has_nan, impute_strategy +): y = pd.Series([1, 2, 1]) override_types = [Integer, Double, Categorical, NaturalLanguage, Boolean] for logical_type in override_types: diff --git a/evalml/tests/component_tests/test_stacked_ensemble_classifier.py b/evalml/tests/component_tests/test_stacked_ensemble_classifier.py index 466d8a7773..292ebd1466 100644 --- a/evalml/tests/component_tests/test_stacked_ensemble_classifier.py +++ b/evalml/tests/component_tests/test_stacked_ensemble_classifier.py @@ -8,12 +8,9 @@ from evalml.model_family import ModelFamily from evalml.pipelines import ( BinaryClassificationPipeline, - MulticlassClassificationPipeline -) -from evalml.pipelines.components import ( - BaselineClassifier, - RandomForestClassifier + MulticlassClassificationPipeline, ) +from evalml.pipelines.components import BaselineClassifier, RandomForestClassifier from evalml.pipelines.components.ensemble import StackedEnsembleClassifier from evalml.problem_types import ProblemTypes @@ -23,42 +20,60 @@ def test_stacked_model_family(): def test_stacked_default_parameters(): - assert StackedEnsembleClassifier.default_parameters == {'final_estimator': None, - 'cv': None, - 'n_jobs': -1 - } + assert StackedEnsembleClassifier.default_parameters == { + "final_estimator": None, + "cv": None, + "n_jobs": -1, + } def test_stacked_ensemble_init_with_invalid_estimators_parameter(): - with pytest.raises(EnsembleMissingPipelinesError, match='must not be None or an empty list.'): + with pytest.raises( + EnsembleMissingPipelinesError, match="must not be None or an empty list." + ): StackedEnsembleClassifier() - with pytest.raises(EnsembleMissingPipelinesError, match='must not be None or an empty list.'): + with pytest.raises( + EnsembleMissingPipelinesError, match="must not be None or an empty list." 
+ ): StackedEnsembleClassifier(input_pipelines=[]) def test_stacked_ensemble_nonstackable_model_families(): - with pytest.raises(ValueError, match="Pipelines with any of the following model families cannot be used as base pipelines"): - StackedEnsembleClassifier(input_pipelines=[BinaryClassificationPipeline([BaselineClassifier])]) + with pytest.raises( + ValueError, + match="Pipelines with any of the following model families cannot be used as base pipelines", + ): + StackedEnsembleClassifier( + input_pipelines=[BinaryClassificationPipeline([BaselineClassifier])] + ) def test_stacked_different_input_pipelines_classification(): - input_pipelines = [BinaryClassificationPipeline([RandomForestClassifier]), - MulticlassClassificationPipeline([RandomForestClassifier])] - with pytest.raises(ValueError, match="All pipelines must have the same problem type."): + input_pipelines = [ + BinaryClassificationPipeline([RandomForestClassifier]), + MulticlassClassificationPipeline([RandomForestClassifier]), + ] + with pytest.raises( + ValueError, match="All pipelines must have the same problem type." + ): StackedEnsembleClassifier(input_pipelines=input_pipelines) -def test_stacked_ensemble_init_with_multiple_same_estimators(X_y_binary, logistic_regression_binary_pipeline_class): +def test_stacked_ensemble_init_with_multiple_same_estimators( + X_y_binary, logistic_regression_binary_pipeline_class +): # Checks that it is okay to pass multiple of the same type of estimator X, y = X_y_binary - input_pipelines = [logistic_regression_binary_pipeline_class(parameters={}), - logistic_regression_binary_pipeline_class(parameters={})] + input_pipelines = [ + logistic_regression_binary_pipeline_class(parameters={}), + logistic_regression_binary_pipeline_class(parameters={}), + ] clf = StackedEnsembleClassifier(input_pipelines=input_pipelines, n_jobs=1) expected_parameters = { "input_pipelines": input_pipelines, "final_estimator": None, - 'cv': None, - 'n_jobs': 1 + "cv": None, + "n_jobs": 1, } assert clf.parameters == expected_parameters @@ -70,15 +85,17 @@ def test_stacked_ensemble_init_with_multiple_same_estimators(X_y_binary, logisti assert not np.isnan(y_pred).all() -def test_stacked_ensemble_n_jobs_negative_one(X_y_binary, logistic_regression_binary_pipeline_class): +def test_stacked_ensemble_n_jobs_negative_one( + X_y_binary, logistic_regression_binary_pipeline_class +): X, y = X_y_binary input_pipelines = [logistic_regression_binary_pipeline_class(parameters={})] clf = StackedEnsembleClassifier(input_pipelines=input_pipelines, n_jobs=-1) expected_parameters = { "input_pipelines": input_pipelines, "final_estimator": None, - 'cv': None, - 'n_jobs': -1 + "cv": None, + "n_jobs": -1, } assert clf.parameters == expected_parameters clf.fit(X, y) @@ -87,13 +104,20 @@ def test_stacked_ensemble_n_jobs_negative_one(X_y_binary, logistic_regression_bi assert not np.isnan(y_pred).all() -@patch('evalml.pipelines.components.ensemble.StackedEnsembleClassifier._stacking_estimator_class') -def test_stacked_ensemble_does_not_overwrite_pipeline_random_seed(mock_stack, - logistic_regression_binary_pipeline_class): - input_pipelines = [logistic_regression_binary_pipeline_class(parameters={}, random_seed=3), - logistic_regression_binary_pipeline_class(parameters={}, random_seed=4)] - clf = StackedEnsembleClassifier(input_pipelines=input_pipelines, random_seed=5, n_jobs=1) - estimators_used_in_ensemble = mock_stack.call_args[1]['estimators'] +@patch( + 
"evalml.pipelines.components.ensemble.StackedEnsembleClassifier._stacking_estimator_class" +) +def test_stacked_ensemble_does_not_overwrite_pipeline_random_seed( + mock_stack, logistic_regression_binary_pipeline_class +): + input_pipelines = [ + logistic_regression_binary_pipeline_class(parameters={}, random_seed=3), + logistic_regression_binary_pipeline_class(parameters={}, random_seed=4), + ] + clf = StackedEnsembleClassifier( + input_pipelines=input_pipelines, random_seed=5, n_jobs=1 + ) + estimators_used_in_ensemble = mock_stack.call_args[1]["estimators"] assert clf.random_seed == 5 assert estimators_used_in_ensemble[0][1].pipeline.random_seed == 3 assert estimators_used_in_ensemble[1][1].pipeline.random_seed == 4 @@ -103,10 +127,15 @@ def test_stacked_ensemble_multilevel(logistic_regression_binary_pipeline_class): # checks passing a stacked ensemble classifier as a final estimator X = pd.DataFrame(np.random.rand(50, 5)) y = pd.Series([1, 0] * 25) - base = StackedEnsembleClassifier(input_pipelines=[logistic_regression_binary_pipeline_class(parameters={})], n_jobs=1) - clf = StackedEnsembleClassifier(input_pipelines=[logistic_regression_binary_pipeline_class(parameters={})], - final_estimator=base, - n_jobs=1) + base = StackedEnsembleClassifier( + input_pipelines=[logistic_regression_binary_pipeline_class(parameters={})], + n_jobs=1, + ) + clf = StackedEnsembleClassifier( + input_pipelines=[logistic_regression_binary_pipeline_class(parameters={})], + final_estimator=base, + n_jobs=1, + ) clf.fit(X, y) y_pred = clf.predict(X) assert len(y_pred) == len(y) @@ -116,13 +145,18 @@ def test_stacked_ensemble_multilevel(logistic_regression_binary_pipeline_class): def test_stacked_problem_types(): assert ProblemTypes.BINARY in StackedEnsembleClassifier.supported_problem_types assert ProblemTypes.MULTICLASS in StackedEnsembleClassifier.supported_problem_types - assert StackedEnsembleClassifier.supported_problem_types == [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, - ProblemTypes.TIME_SERIES_MULTICLASS] + assert StackedEnsembleClassifier.supported_problem_types == [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ] @pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]) -def test_stacked_fit_predict_classification(X_y_binary, X_y_multi, stackable_classifiers, problem_type): +def test_stacked_fit_predict_classification( + X_y_binary, X_y_multi, stackable_classifiers, problem_type +): if problem_type == ProblemTypes.BINARY: X, y = X_y_binary num_classes = 2 @@ -131,7 +165,9 @@ def test_stacked_fit_predict_classification(X_y_binary, X_y_multi, stackable_cla X, y = X_y_multi num_classes = 3 pipeline_class = MulticlassClassificationPipeline - input_pipelines = [pipeline_class([classifier]) for classifier in stackable_classifiers] + input_pipelines = [ + pipeline_class([classifier]) for classifier in stackable_classifiers + ] clf = StackedEnsembleClassifier(input_pipelines=input_pipelines, n_jobs=1) clf.fit(X, y) y_pred = clf.predict(X) @@ -144,7 +180,11 @@ def test_stacked_fit_predict_classification(X_y_binary, X_y_multi, stackable_cla assert y_pred_proba.shape == (len(y), num_classes) assert not np.isnan(y_pred_proba).all().all() - clf = StackedEnsembleClassifier(input_pipelines=input_pipelines, final_estimator=RandomForestClassifier(), n_jobs=1) + clf = StackedEnsembleClassifier( + input_pipelines=input_pipelines, + 
final_estimator=RandomForestClassifier(), + n_jobs=1, + ) clf.fit(X, y) y_pred = clf.predict(X) assert len(y_pred) == len(y) @@ -158,18 +198,24 @@ def test_stacked_fit_predict_classification(X_y_binary, X_y_multi, stackable_cla @pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]) -@patch('evalml.pipelines.components.ensemble.StackedEnsembleClassifier.fit') -def test_stacked_feature_importance(mock_fit, X_y_binary, X_y_multi, stackable_classifiers, problem_type): +@patch("evalml.pipelines.components.ensemble.StackedEnsembleClassifier.fit") +def test_stacked_feature_importance( + mock_fit, X_y_binary, X_y_multi, stackable_classifiers, problem_type +): if problem_type == ProblemTypes.BINARY: X, y = X_y_binary pipeline_class = BinaryClassificationPipeline elif problem_type == ProblemTypes.MULTICLASS: X, y = X_y_multi pipeline_class = MulticlassClassificationPipeline - input_pipelines = [pipeline_class([classifier]) for classifier in stackable_classifiers] + input_pipelines = [ + pipeline_class([classifier]) for classifier in stackable_classifiers + ] clf = StackedEnsembleClassifier(input_pipelines=input_pipelines, n_jobs=1) clf.fit(X, y) mock_fit.assert_called() clf._is_fitted = True - with pytest.raises(NotImplementedError, match="feature_importance is not implemented"): + with pytest.raises( + NotImplementedError, match="feature_importance is not implemented" + ): clf.feature_importance diff --git a/evalml/tests/component_tests/test_stacked_ensemble_regressor.py b/evalml/tests/component_tests/test_stacked_ensemble_regressor.py index 50b12e2783..6224b84b2e 100644 --- a/evalml/tests/component_tests/test_stacked_ensemble_regressor.py +++ b/evalml/tests/component_tests/test_stacked_ensemble_regressor.py @@ -11,7 +11,7 @@ BaselineRegressor, RandomForestClassifier, RandomForestRegressor, - StackedEnsembleRegressor + StackedEnsembleRegressor, ) from evalml.problem_types import ProblemTypes @@ -21,42 +21,60 @@ def test_stacked_model_family(): def test_stacked_default_parameters(): - assert StackedEnsembleRegressor.default_parameters == {'final_estimator': None, - 'cv': None, - 'n_jobs': -1 - } + assert StackedEnsembleRegressor.default_parameters == { + "final_estimator": None, + "cv": None, + "n_jobs": -1, + } def test_stacked_ensemble_init_with_invalid_estimators_parameter(): - with pytest.raises(EnsembleMissingPipelinesError, match='must not be None or an empty list.'): + with pytest.raises( + EnsembleMissingPipelinesError, match="must not be None or an empty list." + ): StackedEnsembleRegressor() - with pytest.raises(EnsembleMissingPipelinesError, match='must not be None or an empty list.'): + with pytest.raises( + EnsembleMissingPipelinesError, match="must not be None or an empty list." 
+ ): StackedEnsembleRegressor(input_pipelines=[]) def test_stacked_ensemble_nonstackable_model_families(): - with pytest.raises(ValueError, match="Pipelines with any of the following model families cannot be used as base pipelines"): - StackedEnsembleRegressor(input_pipelines=[RegressionPipeline([BaselineRegressor])]) + with pytest.raises( + ValueError, + match="Pipelines with any of the following model families cannot be used as base pipelines", + ): + StackedEnsembleRegressor( + input_pipelines=[RegressionPipeline([BaselineRegressor])] + ) def test_stacked_different_input_pipelines_regression(): - input_pipelines = [RegressionPipeline([RandomForestRegressor]), - BinaryClassificationPipeline([RandomForestClassifier])] - with pytest.raises(ValueError, match="All pipelines must have the same problem type."): + input_pipelines = [ + RegressionPipeline([RandomForestRegressor]), + BinaryClassificationPipeline([RandomForestClassifier]), + ] + with pytest.raises( + ValueError, match="All pipelines must have the same problem type." + ): StackedEnsembleRegressor(input_pipelines=input_pipelines) -def test_stacked_ensemble_init_with_multiple_same_estimators(X_y_regression, linear_regression_pipeline_class): +def test_stacked_ensemble_init_with_multiple_same_estimators( + X_y_regression, linear_regression_pipeline_class +): # Checks that it is okay to pass multiple of the same type of estimator X, y = X_y_regression - input_pipelines = [linear_regression_pipeline_class(parameters={}), - linear_regression_pipeline_class(parameters={})] + input_pipelines = [ + linear_regression_pipeline_class(parameters={}), + linear_regression_pipeline_class(parameters={}), + ] clf = StackedEnsembleRegressor(input_pipelines=input_pipelines, n_jobs=1) expected_parameters = { "input_pipelines": input_pipelines, "final_estimator": None, - 'cv': None, - 'n_jobs': 1 + "cv": None, + "n_jobs": 1, } assert clf.parameters == expected_parameters @@ -68,15 +86,17 @@ def test_stacked_ensemble_init_with_multiple_same_estimators(X_y_regression, lin assert not np.isnan(y_pred).all() -def test_stacked_ensemble_n_jobs_negative_one(X_y_regression, linear_regression_pipeline_class): +def test_stacked_ensemble_n_jobs_negative_one( + X_y_regression, linear_regression_pipeline_class +): X, y = X_y_regression input_pipelines = [linear_regression_pipeline_class(parameters={})] clf = StackedEnsembleRegressor(input_pipelines=input_pipelines) expected_parameters = { "input_pipelines": input_pipelines, "final_estimator": None, - 'cv': None, - 'n_jobs': -1 + "cv": None, + "n_jobs": -1, } assert clf.parameters == expected_parameters clf.fit(X, y) @@ -85,13 +105,20 @@ def test_stacked_ensemble_n_jobs_negative_one(X_y_regression, linear_regression_ assert not np.isnan(y_pred).all() -@patch('evalml.pipelines.components.ensemble.StackedEnsembleRegressor._stacking_estimator_class') -def test_stacked_ensemble_does_not_overwrite_pipeline_random_seed(mock_stack, - linear_regression_pipeline_class): - input_pipelines = [linear_regression_pipeline_class(parameters={}, random_seed=3), - linear_regression_pipeline_class(parameters={}, random_seed=4)] - clf = StackedEnsembleRegressor(input_pipelines=input_pipelines, random_seed=5, n_jobs=1) - estimators_used_in_ensemble = mock_stack.call_args[1]['estimators'] +@patch( + "evalml.pipelines.components.ensemble.StackedEnsembleRegressor._stacking_estimator_class" +) +def test_stacked_ensemble_does_not_overwrite_pipeline_random_seed( + mock_stack, linear_regression_pipeline_class +): + input_pipelines = [ + 
linear_regression_pipeline_class(parameters={}, random_seed=3), + linear_regression_pipeline_class(parameters={}, random_seed=4), + ] + clf = StackedEnsembleRegressor( + input_pipelines=input_pipelines, random_seed=5, n_jobs=1 + ) + estimators_used_in_ensemble = mock_stack.call_args[1]["estimators"] assert clf.random_seed == 5 assert estimators_used_in_ensemble[0][1].pipeline.random_seed == 3 assert estimators_used_in_ensemble[1][1].pipeline.random_seed == 4 @@ -100,11 +127,19 @@ def test_stacked_ensemble_does_not_overwrite_pipeline_random_seed(mock_stack, def test_stacked_ensemble_multilevel(linear_regression_pipeline_class): # checks passing a stacked ensemble classifier as a final estimator X = pd.DataFrame(np.random.rand(50, 5)) - y = pd.Series(np.random.rand(50,)) - base = StackedEnsembleRegressor(input_pipelines=[linear_regression_pipeline_class(parameters={})], n_jobs=1) - clf = StackedEnsembleRegressor(input_pipelines=[linear_regression_pipeline_class(parameters={})], - final_estimator=base, - n_jobs=1) + y = pd.Series( + np.random.rand( + 50, + ) + ) + base = StackedEnsembleRegressor( + input_pipelines=[linear_regression_pipeline_class(parameters={})], n_jobs=1 + ) + clf = StackedEnsembleRegressor( + input_pipelines=[linear_regression_pipeline_class(parameters={})], + final_estimator=base, + n_jobs=1, + ) clf.fit(X, y) y_pred = clf.predict(X) assert len(y_pred) == len(y) @@ -118,7 +153,9 @@ def test_stacked_problem_types(): def test_stacked_fit_predict_regression(X_y_regression, stackable_regressors): X, y = X_y_regression - input_pipelines = [RegressionPipeline([regressor]) for regressor in stackable_regressors] + input_pipelines = [ + RegressionPipeline([regressor]) for regressor in stackable_regressors + ] clf = StackedEnsembleRegressor(input_pipelines=input_pipelines, n_jobs=1) clf.fit(X, y) y_pred = clf.predict(X) @@ -126,7 +163,11 @@ def test_stacked_fit_predict_regression(X_y_regression, stackable_regressors): assert isinstance(y_pred, pd.Series) assert not np.isnan(y_pred).all() - clf = StackedEnsembleRegressor(input_pipelines=input_pipelines, final_estimator=RandomForestRegressor(), n_jobs=1) + clf = StackedEnsembleRegressor( + input_pipelines=input_pipelines, + final_estimator=RandomForestRegressor(), + n_jobs=1, + ) clf.fit(X, y) y_pred = clf.predict(X) assert len(y_pred) == len(y) @@ -134,13 +175,17 @@ def test_stacked_fit_predict_regression(X_y_regression, stackable_regressors): assert not np.isnan(y_pred).all() -@patch('evalml.pipelines.components.ensemble.StackedEnsembleRegressor.fit') +@patch("evalml.pipelines.components.ensemble.StackedEnsembleRegressor.fit") def test_stacked_feature_importance(mock_fit, X_y_regression, stackable_regressors): X, y = X_y_regression - input_pipelines = [RegressionPipeline([regressor]) for regressor in stackable_regressors] + input_pipelines = [ + RegressionPipeline([regressor]) for regressor in stackable_regressors + ] clf = StackedEnsembleRegressor(input_pipelines=input_pipelines, n_jobs=1) clf.fit(X, y) mock_fit.assert_called() clf._is_fitted = True - with pytest.raises(NotImplementedError, match="feature_importance is not implemented"): + with pytest.raises( + NotImplementedError, match="feature_importance is not implemented" + ): clf.feature_importance diff --git a/evalml/tests/component_tests/test_standard_scaler.py b/evalml/tests/component_tests/test_standard_scaler.py index 48d02c47e9..de5a2f1376 100644 --- a/evalml/tests/component_tests/test_standard_scaler.py +++ b/evalml/tests/component_tests/test_standard_scaler.py @@ 
-6,9 +6,14 @@ from evalml.pipelines.components import StandardScaler -@pytest.mark.parametrize("X_df", [pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), - pd.DataFrame(pd.Series([1., 2., 3.], dtype="float")), - pd.DataFrame(pd.Series([True, False, True], dtype="boolean"))]) +@pytest.mark.parametrize( + "X_df", + [ + pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), + pd.DataFrame(pd.Series([1.0, 2.0, 3.0], dtype="float")), + pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), + ], +) def test_standard_scaler_woodwork_custom_overrides_returned_by_components(X_df): y = pd.Series([1, 2, 1]) override_types = [Integer, Double, Categorical, Boolean] diff --git a/evalml/tests/component_tests/test_svm_classifier.py b/evalml/tests/component_tests/test_svm_classifier.py index 0c6ee2d831..52c4d1e20c 100644 --- a/evalml/tests/component_tests/test_svm_classifier.py +++ b/evalml/tests/component_tests/test_svm_classifier.py @@ -12,9 +12,12 @@ def test_model_family(): def test_problem_types(): - assert set(SVMClassifier.supported_problem_types) == {ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, - ProblemTypes.TIME_SERIES_MULTICLASS} + assert set(SVMClassifier.supported_problem_types) == { + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + } def test_fit_predict_binary(X_y_binary): @@ -53,7 +56,7 @@ def test_fit_predict_multi(X_y_multi): np.testing.assert_almost_equal(y_pred_proba.values, y_pred_proba_sk, decimal=5) -@pytest.mark.parametrize('kernel', ['linear', 'rbf', 'sigmoid']) +@pytest.mark.parametrize("kernel", ["linear", "rbf", "sigmoid"]) def test_feature_importance(kernel, X_y_binary): X, y = X_y_binary @@ -61,7 +64,7 @@ def test_feature_importance(kernel, X_y_binary): sk_svc = SVC(kernel=kernel, random_state=0) sk_svc.fit(X, y) - if kernel == 'linear': + if kernel == "linear": sk_feature_importance = sk_svc.coef_ else: sk_feature_importance = np.zeros(sk_svc.n_features_in_) diff --git a/evalml/tests/component_tests/test_svm_regressor.py b/evalml/tests/component_tests/test_svm_regressor.py index 3d0dd406e3..27a1ae3fe8 100644 --- a/evalml/tests/component_tests/test_svm_regressor.py +++ b/evalml/tests/component_tests/test_svm_regressor.py @@ -12,7 +12,10 @@ def test_model_family(): def test_problem_types(): - assert set(SVMRegressor.supported_problem_types) == {ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION} + assert set(SVMRegressor.supported_problem_types) == { + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + } def test_fit_predict_regression(X_y_regression): @@ -29,7 +32,7 @@ def test_fit_predict_regression(X_y_regression): np.testing.assert_almost_equal(y_pred.values, y_pred_sk, decimal=5) -@pytest.mark.parametrize('kernel', ['linear', 'rbf', 'sigmoid']) +@pytest.mark.parametrize("kernel", ["linear", "rbf", "sigmoid"]) def test_feature_importance(kernel, X_y_regression): X, y = X_y_regression @@ -37,7 +40,7 @@ def test_feature_importance(kernel, X_y_regression): sk_svr = SVR(kernel=kernel) sk_svr.fit(X, y) - if kernel == 'linear': + if kernel == "linear": sk_feature_importance = sk_svr.coef_ else: sk_feature_importance = np.zeros(sk_svr.n_features_in_) diff --git a/evalml/tests/component_tests/test_target_encoder.py b/evalml/tests/component_tests/test_target_encoder.py index 50f5d3acf7..6a8ec42fb1 100644 --- a/evalml/tests/component_tests/test_target_encoder.py +++ b/evalml/tests/component_tests/test_target_encoder.py @@ -12,30 +12,36 @@ 
Datetime, Double, Integer, - NaturalLanguage + NaturalLanguage, ) from evalml.exceptions import ComponentNotYetFittedError from evalml.pipelines.components import TargetEncoder -importorskip('category_encoders', reason='Skipping test because category_encoders not installed') +importorskip( + "category_encoders", reason="Skipping test because category_encoders not installed" +) def test_init(): - parameters = {"cols": None, - "smoothing": 1.0, - "handle_unknown": "value", - "handle_missing": "value"} + parameters = { + "cols": None, + "smoothing": 1.0, + "handle_unknown": "value", + "handle_missing": "value", + } encoder = TargetEncoder() assert encoder.parameters == parameters def test_parameters(): - encoder = TargetEncoder(cols=['a']) - expected_parameters = {"cols": ['a'], - "smoothing": 1.0, - "handle_unknown": "value", - "handle_missing": "value"} + encoder = TargetEncoder(cols=["a"]) + expected_parameters = { + "cols": ["a"], + "smoothing": 1.0, + "handle_unknown": "value", + "handle_missing": "value", + } assert encoder.parameters == expected_parameters @@ -47,60 +53,94 @@ def test_categories(): def test_invalid_inputs(): with pytest.raises(ValueError, match="Invalid input 'test' for handle_unknown"): - TargetEncoder(handle_unknown='test') + TargetEncoder(handle_unknown="test") with pytest.raises(ValueError, match="Invalid input 'test2' for handle_missing"): - TargetEncoder(handle_missing='test2') - with pytest.raises(ValueError, match="Smoothing value needs to be strictly larger than 0"): + TargetEncoder(handle_missing="test2") + with pytest.raises( + ValueError, match="Smoothing value needs to be strictly larger than 0" + ): TargetEncoder(smoothing=0) def test_null_values_in_dataframe(): - X = pd.DataFrame({'col_1': ["a", "b", "c", "d", np.nan], - 'col_2': ["a", "b", "a", "c", "b"], - 'col_3': ["a", "a", "a", "a", "a"]}) + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", np.nan], + "col_2": ["a", "b", "a", "c", "b"], + "col_3": ["a", "a", "a", "a", "a"], + } + ) y = pd.Series([0, 1, 1, 1, 0]) - encoder = TargetEncoder(handle_missing='value') + encoder = TargetEncoder(handle_missing="value") encoder.fit(X, y) X_t = encoder.transform(X) - X_expected = pd.DataFrame({'col_1': [0.6, 0.6, 0.6, 0.6, 0.6], - 'col_2': [0.526894, 0.526894, 0.526894, 0.6, 0.526894], - 'col_3': [0.6, 0.6, 0.6, 0.6, 0.6, ]}) + X_expected = pd.DataFrame( + { + "col_1": [0.6, 0.6, 0.6, 0.6, 0.6], + "col_2": [0.526894, 0.526894, 0.526894, 0.6, 0.526894], + "col_3": [ + 0.6, + 0.6, + 0.6, + 0.6, + 0.6, + ], + } + ) assert_frame_equal(X_expected, X_t) - encoder = TargetEncoder(handle_missing='return_nan') + encoder = TargetEncoder(handle_missing="return_nan") encoder.fit(X, y) X_t = encoder.transform(X) - X_expected = pd.DataFrame({'col_1': [0.6, 0.6, 0.6, 0.6, np.nan], - 'col_2': [0.526894, 0.526894, 0.526894, 0.6, 0.526894], - 'col_3': [0.6, 0.6, 0.6, 0.6, 0.6, ]}) + X_expected = pd.DataFrame( + { + "col_1": [0.6, 0.6, 0.6, 0.6, np.nan], + "col_2": [0.526894, 0.526894, 0.526894, 0.6, 0.526894], + "col_3": [ + 0.6, + 0.6, + 0.6, + 0.6, + 0.6, + ], + } + ) assert_frame_equal(X_expected, X_t) - encoder = TargetEncoder(handle_missing='error') - with pytest.raises(ValueError, match='Columns to be encoded can not contain null'): + encoder = TargetEncoder(handle_missing="error") + with pytest.raises(ValueError, match="Columns to be encoded can not contain null"): encoder.fit(X, y) def test_cols(): - X = pd.DataFrame({'col_1': [1, 2, 1, 1, 2], - 'col_2': ['2', '1', '1', '1', '1'], - 'col_3': ["a", "a", "a", "a", 
"a"]}) - X_expected = X.astype({'col_1': 'int64', 'col_2': 'category', 'col_3': 'category'}) + X = pd.DataFrame( + { + "col_1": [1, 2, 1, 1, 2], + "col_2": ["2", "1", "1", "1", "1"], + "col_3": ["a", "a", "a", "a", "a"], + } + ) + X_expected = X.astype({"col_1": "int64", "col_2": "category", "col_3": "category"}) y = pd.Series([0, 1, 1, 1, 0]) encoder = TargetEncoder(cols=[]) encoder.fit(X, y) X_t = encoder.transform(X) assert_frame_equal(X_expected, X_t) - encoder = TargetEncoder(cols=['col_2']) + encoder = TargetEncoder(cols=["col_2"]) encoder.fit(X, y) X_t = encoder.transform(X) - X_expected = pd.DataFrame({'col_1': pd.Series([1, 2, 1, 1, 2], dtype="int64"), - 'col_2': [0.60000, 0.742886, 0.742886, 0.742886, 0.742886], - 'col_3': pd.Series(["a", "a", "a", "a", "a"], dtype="category")}) + X_expected = pd.DataFrame( + { + "col_1": pd.Series([1, 2, 1, 1, 2], dtype="int64"), + "col_2": [0.60000, 0.742886, 0.742886, 0.742886, 0.742886], + "col_3": pd.Series(["a", "a", "a", "a", "a"], dtype="category"), + } + ) assert_frame_equal(X_expected, X_t, check_less_precise=True) - encoder = TargetEncoder(cols=['col_2', 'col_3']) + encoder = TargetEncoder(cols=["col_2", "col_3"]) encoder.fit(X, y) X_t = encoder.transform(X) encoder2 = TargetEncoder() @@ -110,63 +150,96 @@ def test_cols(): def test_transform(): - X = pd.DataFrame({'col_1': [1, 2, 1, 1, 2], - 'col_2': ["r", "t", "s", "t", "t"], - 'col_3': ["a", "a", "a", "b", "a"]}) + X = pd.DataFrame( + { + "col_1": [1, 2, 1, 1, 2], + "col_2": ["r", "t", "s", "t", "t"], + "col_3": ["a", "a", "a", "b", "a"], + } + ) y = pd.Series([0, 1, 1, 1, 0]) encoder = TargetEncoder() encoder.fit(X, y) X_t = encoder.transform(X) - X_expected = pd.DataFrame({'col_1': pd.Series([1, 2, 1, 1, 2], dtype="int64"), - 'col_2': [0.6, 0.65872, 0.6, 0.65872, 0.65872], - 'col_3': [0.504743, 0.504743, 0.504743, 0.6, 0.504743]}) + X_expected = pd.DataFrame( + { + "col_1": pd.Series([1, 2, 1, 1, 2], dtype="int64"), + "col_2": [0.6, 0.65872, 0.6, 0.65872, 0.65872], + "col_3": [0.504743, 0.504743, 0.504743, 0.6, 0.504743], + } + ) assert_frame_equal(X_expected, X_t) def test_smoothing(): # larger smoothing values should bring the values closer to the global mean - X = pd.DataFrame({'col_1': [1, 2, 1, 1, 2], - 'col_2': [2, 1, 1, 1, 1], - 'col_3': ["a", "a", "a", "a", "b"]}) + X = pd.DataFrame( + { + "col_1": [1, 2, 1, 1, 2], + "col_2": [2, 1, 1, 1, 1], + "col_3": ["a", "a", "a", "a", "b"], + } + ) y = pd.Series([0, 1, 1, 1, 0]) encoder = TargetEncoder(smoothing=1) encoder.fit(X, y) X_t = encoder.transform(X) - X_expected = pd.DataFrame({'col_1': pd.Series([1, 2, 1, 1, 2], dtype="int64"), - 'col_2': pd.Series([2, 1, 1, 1, 1], dtype="int64"), - 'col_3': [0.742886, 0.742886, 0.742886, 0.742886, 0.6]}) + X_expected = pd.DataFrame( + { + "col_1": pd.Series([1, 2, 1, 1, 2], dtype="int64"), + "col_2": pd.Series([2, 1, 1, 1, 1], dtype="int64"), + "col_3": [0.742886, 0.742886, 0.742886, 0.742886, 0.6], + } + ) assert_frame_equal(X_expected, X_t) encoder = TargetEncoder(smoothing=10) encoder.fit(X, y) X_t = encoder.transform(X) - X_expected = pd.DataFrame({'col_1': pd.Series([1, 2, 1, 1, 2], dtype="int64"), - 'col_2': pd.Series([2, 1, 1, 1, 1], dtype="int64"), - 'col_3': [0.686166, 0.686166, 0.686166, 0.686166, 0.6]}) + X_expected = pd.DataFrame( + { + "col_1": pd.Series([1, 2, 1, 1, 2], dtype="int64"), + "col_2": pd.Series([2, 1, 1, 1, 1], dtype="int64"), + "col_3": [0.686166, 0.686166, 0.686166, 0.686166, 0.6], + } + ) assert_frame_equal(X_expected, X_t) encoder = 
TargetEncoder(smoothing=100) encoder.fit(X, y) X_t = encoder.transform(X) - X_expected = pd.DataFrame({'col_1': pd.Series([1, 2, 1, 1, 2], dtype="int64"), - 'col_2': pd.Series([2, 1, 1, 1, 1], dtype="int64"), - 'col_3': [0.676125, 0.676125, 0.676125, 0.676125, 0.6]}) + X_expected = pd.DataFrame( + { + "col_1": pd.Series([1, 2, 1, 1, 2], dtype="int64"), + "col_2": pd.Series([2, 1, 1, 1, 1], dtype="int64"), + "col_3": [0.676125, 0.676125, 0.676125, 0.676125, 0.6], + } + ) assert_frame_equal(X_expected, X_t) def test_get_feature_names(): - X = pd.DataFrame({'col_1': [1, 2, 1, 1, 2], - 'col_2': ["r", "t", "s", "t", "t"], - 'col_3': ["a", "a", "a", "b", "a"]}) + X = pd.DataFrame( + { + "col_1": [1, 2, 1, 1, 2], + "col_2": ["r", "t", "s", "t", "t"], + "col_3": ["a", "a", "a", "b", "a"], + } + ) y = pd.Series([0, 1, 1, 1, 0]) encoder = TargetEncoder() - with pytest.raises(ComponentNotYetFittedError, match='This TargetEncoder is not fitted yet. You must fit'): + with pytest.raises( + ComponentNotYetFittedError, + match="This TargetEncoder is not fitted yet. You must fit", + ): encoder.get_feature_names() encoder.fit(X, y) - np.testing.assert_array_equal(encoder.get_feature_names(), np.array(['col_1', 'col_2', 'col_3'])) + np.testing.assert_array_equal( + encoder.get_feature_names(), np.array(["col_1", "col_2", "col_3"]) + ) -@patch('evalml.pipelines.components.transformers.transformer.Transformer.fit') +@patch("evalml.pipelines.components.transformers.transformer.Transformer.fit") def test_pandas_numpy(mock_fit, X_y_binary): X, y = X_y_binary X = pd.DataFrame(X).sample(frac=1) @@ -180,12 +253,24 @@ def test_pandas_numpy(mock_fit, X_y_binary): encoder.fit(X_numpy, y) -@pytest.mark.parametrize("X_df", [pd.DataFrame(pd.to_datetime(['20190902', '20200519', '20190607'], format='%Y%m%d')), - pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), - pd.DataFrame(pd.Series([1., 2., 3.], dtype="float")), - pd.DataFrame(pd.Series(['a', 'b', 'a'], dtype="category")), - pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), - pd.DataFrame(pd.Series(['this will be a natural language column because length', 'yay', 'hay'], dtype="string"))]) +@pytest.mark.parametrize( + "X_df", + [ + pd.DataFrame( + pd.to_datetime(["20190902", "20200519", "20190607"], format="%Y%m%d") + ), + pd.DataFrame(pd.Series([1, 2, 3], dtype="Int64")), + pd.DataFrame(pd.Series([1.0, 2.0, 3.0], dtype="float")), + pd.DataFrame(pd.Series(["a", "b", "a"], dtype="category")), + pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), + pd.DataFrame( + pd.Series( + ["this will be a natural language column because length", "yay", "hay"], + dtype="string", + ) + ), + ], +) def test_target_encoder_woodwork_custom_overrides_returned_by_components(X_df): y = pd.Series([1, 2, 1]) override_types = [Integer, Double, Categorical, NaturalLanguage, Boolean, Datetime] diff --git a/evalml/tests/component_tests/test_target_imputer.py b/evalml/tests/component_tests/test_target_imputer.py index 8b13f56154..250b5c6ba6 100644 --- a/evalml/tests/component_tests/test_target_imputer.py +++ b/evalml/tests/component_tests/test_target_imputer.py @@ -8,7 +8,7 @@ Categorical, Double, Integer, - NaturalLanguage + NaturalLanguage, ) from evalml.pipelines.components import TargetImputer @@ -27,7 +27,7 @@ def test_target_imputer_no_y(X_y_binary): def test_target_imputer_with_X(): X = pd.DataFrame({"some col": [1, 3, np.nan]}) y = pd.Series([np.nan, 1, 3]) - imputer = TargetImputer(impute_strategy='median') + imputer = TargetImputer(impute_strategy="median") 
y_expected = pd.Series([2, 1, 3]) X_expected = pd.DataFrame({"some col": [1, 3, np.nan]}) X_t, y_t = imputer.fit_transform(X, y) @@ -37,7 +37,7 @@ def test_target_imputer_with_X(): def test_target_imputer_median(): y = pd.Series([np.nan, 1, 10, 10, 6]) - imputer = TargetImputer(impute_strategy='median') + imputer = TargetImputer(impute_strategy="median") y_expected = pd.Series([8, 1, 10, 10, 6]) _, y_t = imputer.fit_transform(None, y) assert_series_equal(y_expected, y_t, check_dtype=False) @@ -45,31 +45,40 @@ def test_target_imputer_median(): def test_target_imputer_mean(): y = pd.Series([np.nan, 2, 0]) - imputer = TargetImputer(impute_strategy='mean') + imputer = TargetImputer(impute_strategy="mean") y_expected = pd.Series([1, 2, 0]) _, y_t = imputer.fit_transform(None, y) assert_series_equal(y_expected, y_t, check_dtype=False) -@pytest.mark.parametrize("fill_value, y, y_expected", [(None, pd.Series([np.nan, 0, 5]), pd.Series([0, 0, 5])), - (None, pd.Series([np.nan, "a", "b"]), pd.Series(["missing_value", "a", "b"]).astype("category")), - (3, pd.Series([np.nan, 0, 5]), pd.Series([3, 0, 5])), - (3, pd.Series([np.nan, "a", "b"]), pd.Series([3, "a", "b"]).astype("category"))]) +@pytest.mark.parametrize( + "fill_value, y, y_expected", + [ + (None, pd.Series([np.nan, 0, 5]), pd.Series([0, 0, 5])), + ( + None, + pd.Series([np.nan, "a", "b"]), + pd.Series(["missing_value", "a", "b"]).astype("category"), + ), + (3, pd.Series([np.nan, 0, 5]), pd.Series([3, 0, 5])), + (3, pd.Series([np.nan, "a", "b"]), pd.Series([3, "a", "b"]).astype("category")), + ], +) def test_target_imputer_constant(fill_value, y, y_expected): - imputer = TargetImputer(impute_strategy='constant', fill_value=fill_value) + imputer = TargetImputer(impute_strategy="constant", fill_value=fill_value) _, y_t = imputer.fit_transform(None, y) assert_series_equal(y_expected, y_t, check_dtype=False) def test_target_imputer_most_frequent(): y = pd.Series([np.nan, "a", "b"]) - imputer = TargetImputer(impute_strategy='most_frequent') + imputer = TargetImputer(impute_strategy="most_frequent") y_expected = pd.Series(["a", "a", "b"]).astype("category") _, y_t = imputer.fit_transform(None, y) assert_series_equal(y_expected, y_t, check_dtype=False) y = pd.Series([np.nan, 1, 1, 2]) - imputer = TargetImputer(impute_strategy='most_frequent') + imputer = TargetImputer(impute_strategy="most_frequent") y_expected = pd.Series([1, 1, 1, 2]) _, y_t = imputer.fit_transform(None, y) assert_series_equal(y_expected, y_t, check_dtype=False) @@ -77,19 +86,27 @@ def test_target_imputer_most_frequent(): def test_target_imputer_col_with_non_numeric_with_numeric_strategy(): y = pd.Series([np.nan, "a", "b"]) - imputer = TargetImputer(impute_strategy='mean') - with pytest.raises(ValueError, match="Cannot use mean strategy with non-numeric data"): + imputer = TargetImputer(impute_strategy="mean") + with pytest.raises( + ValueError, match="Cannot use mean strategy with non-numeric data" + ): imputer.fit_transform(None, y) - with pytest.raises(ValueError, match="Cannot use mean strategy with non-numeric data"): + with pytest.raises( + ValueError, match="Cannot use mean strategy with non-numeric data" + ): imputer.fit(None, y) - imputer = TargetImputer(impute_strategy='median') - with pytest.raises(ValueError, match="Cannot use median strategy with non-numeric data"): + imputer = TargetImputer(impute_strategy="median") + with pytest.raises( + ValueError, match="Cannot use median strategy with non-numeric data" + ): imputer.fit_transform(None, y) - with 
pytest.raises(ValueError, match="Cannot use median strategy with non-numeric data"): + with pytest.raises( + ValueError, match="Cannot use median strategy with non-numeric data" + ): imputer.fit(None, y) -@pytest.mark.parametrize("data_type", ['pd', 'ww']) +@pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_target_imputer_all_bool_return_original(data_type, make_data_type): y = pd.Series([True, True, False, True, True], dtype=bool) y = make_data_type(data_type, y) @@ -100,10 +117,10 @@ def test_target_imputer_all_bool_return_original(data_type, make_data_type): assert_series_equal(y_expected, y_t) -@pytest.mark.parametrize("data_type", ['pd', 'ww']) +@pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_target_imputer_boolean_dtype(data_type, make_data_type): - y = pd.Series([True, np.nan, False, np.nan, True], dtype='category') - y_expected = pd.Series([True, True, False, True, True], dtype='category') + y = pd.Series([True, np.nan, False, np.nan, True], dtype="category") + y_expected = pd.Series([True, True, False, True, True], dtype="category") y = make_data_type(data_type, y) imputer = TargetImputer() imputer.fit(None, y) @@ -126,7 +143,7 @@ def test_target_imputer_fit_transform_all_nan_empty(): def test_target_imputer_numpy_input(): y = np.array([np.nan, 0, 2]) - imputer = TargetImputer(impute_strategy='mean') + imputer = TargetImputer(impute_strategy="mean") y_expected = np.array([1, 0, 2]) _, y_t = imputer.fit_transform(None, y) assert np.allclose(y_expected, y_t) @@ -139,38 +156,72 @@ def test_target_imputer_does_not_reset_index(): assert y.index.tolist() == list(range(10)) y.drop(0, inplace=True) - pd.testing.assert_series_equal(pd.Series([1, 2, 3, 4, np.nan, 6, 7, 8, 9], dtype=float, index=list(range(1, 10))), y) + pd.testing.assert_series_equal( + pd.Series( + [1, 2, 3, 4, np.nan, 6, 7, 8, 9], dtype=float, index=list(range(1, 10)) + ), + y, + ) imputer = TargetImputer(impute_strategy="mean") imputer.fit(None, y=y) _, y_t = imputer.transform(None, y) - pd.testing.assert_series_equal(pd.Series([1.0, 2, 3, 4, 5, 6, 7, 8, 9], dtype=float, index=list(range(1, 10))), y_t) - - -@pytest.mark.parametrize("y, y_expected", [(pd.Series([1, 0, 5, None]), pd.Series([1, 0, 5, 2])), - (pd.Series([0.1, 0.0, 0.5, None]), pd.Series([0.1, 0.0, 0.5, 0.2]))]) + pd.testing.assert_series_equal( + pd.Series([1.0, 2, 3, 4, 5, 6, 7, 8, 9], dtype=float, index=list(range(1, 10))), + y_t, + ) + + +@pytest.mark.parametrize( + "y, y_expected", + [ + (pd.Series([1, 0, 5, None]), pd.Series([1, 0, 5, 2])), + (pd.Series([0.1, 0.0, 0.5, None]), pd.Series([0.1, 0.0, 0.5, 0.2])), + ], +) def test_target_imputer_with_none(y, y_expected): imputer = TargetImputer(impute_strategy="mean") _, y_t = imputer.fit_transform(None, y) assert_series_equal(y_expected, y_t, check_dtype=False) -@pytest.mark.parametrize("y, y_expected", [(pd.Series(["b", "a", "a", None], dtype='category'), pd.Series(["b", "a", "a", "a"], dtype='category')), - (pd.Series([True, None, False, True], dtype='category'), pd.Series([True, True, False, True], dtype='category')), - (pd.Series(["b", "a", "a", None]), pd.Series(["b", "a", "a", "a"], dtype='category'))]) +@pytest.mark.parametrize( + "y, y_expected", + [ + ( + pd.Series(["b", "a", "a", None], dtype="category"), + pd.Series(["b", "a", "a", "a"], dtype="category"), + ), + ( + pd.Series([True, None, False, True], dtype="category"), + pd.Series([True, True, False, True], dtype="category"), + ), + ( + pd.Series(["b", "a", "a", None]), + pd.Series(["b", "a", "a", "a"], 
dtype="category"), + ), + ], +) def test_target_imputer_with_none_non_numeric(y, y_expected): imputer = TargetImputer() _, y_t = imputer.fit_transform(None, y) assert_series_equal(y_expected, y_t, check_dtype=False) -@pytest.mark.parametrize("y_pd", [pd.Series([1, 2, 3], dtype="int64"), - pd.Series([1., 2., 3.], dtype="float"), - pd.Series(['a', 'b', 'a'], dtype="category"), - pd.Series([True, False, True], dtype=bool)]) +@pytest.mark.parametrize( + "y_pd", + [ + pd.Series([1, 2, 3], dtype="int64"), + pd.Series([1.0, 2.0, 3.0], dtype="float"), + pd.Series(["a", "b", "a"], dtype="category"), + pd.Series([True, False, True], dtype=bool), + ], +) @pytest.mark.parametrize("has_nan", [True, False]) @pytest.mark.parametrize("impute_strategy", ["mean", "median", "most_frequent"]) -def test_target_imputer_woodwork_custom_overrides_returned_by_components(y_pd, has_nan, impute_strategy): +def test_target_imputer_woodwork_custom_overrides_returned_by_components( + y_pd, has_nan, impute_strategy +): y_to_use = y_pd.copy() if has_nan: y_to_use[len(y_pd) - 1] = np.nan diff --git a/evalml/tests/component_tests/test_text_featurizer.py b/evalml/tests/component_tests/test_text_featurizer.py index 5304af60f1..b5be53fa5c 100644 --- a/evalml/tests/component_tests/test_text_featurizer.py +++ b/evalml/tests/component_tests/test_text_featurizer.py @@ -16,16 +16,20 @@ def test_featurizer_only_text(text_df): tf = TextFeaturizer() tf.fit(X) - expected_col_names = set(['DIVERSITY_SCORE(col_1)', - 'DIVERSITY_SCORE(col_2)', - 'LSA(col_1)[0]', - 'LSA(col_1)[1]', - 'LSA(col_2)[0]', - 'LSA(col_2)[1]', - 'MEAN_CHARACTERS_PER_WORD(col_1)', - 'MEAN_CHARACTERS_PER_WORD(col_2)', - 'POLARITY_SCORE(col_1)', - 'POLARITY_SCORE(col_2)']) + expected_col_names = set( + [ + "DIVERSITY_SCORE(col_1)", + "DIVERSITY_SCORE(col_2)", + "LSA(col_1)[0]", + "LSA(col_1)[1]", + "LSA(col_2)[0]", + "LSA(col_2)[1]", + "MEAN_CHARACTERS_PER_WORD(col_1)", + "MEAN_CHARACTERS_PER_WORD(col_2)", + "POLARITY_SCORE(col_1)", + "POLARITY_SCORE(col_2)", + ] + ) X_t = tf.transform(X) assert set(X_t.columns) == expected_col_names assert len(X_t.columns) == 10 @@ -34,21 +38,25 @@ def test_featurizer_only_text(text_df): def test_featurizer_with_nontext(text_df): X = text_df - X['col_3'] = [73.7, 67.213, 92] + X["col_3"] = [73.7, 67.213, 92] tf = TextFeaturizer() tf.fit(X) - expected_col_names = set(['DIVERSITY_SCORE(col_1)', - 'DIVERSITY_SCORE(col_2)', - 'LSA(col_1)[0]', - 'LSA(col_1)[1]', - 'LSA(col_2)[0]', - 'LSA(col_2)[1]', - 'MEAN_CHARACTERS_PER_WORD(col_1)', - 'MEAN_CHARACTERS_PER_WORD(col_2)', - 'POLARITY_SCORE(col_1)', - 'POLARITY_SCORE(col_2)', - 'col_3']) + expected_col_names = set( + [ + "DIVERSITY_SCORE(col_1)", + "DIVERSITY_SCORE(col_2)", + "LSA(col_1)[0]", + "LSA(col_1)[1]", + "LSA(col_2)[0]", + "LSA(col_2)[1]", + "MEAN_CHARACTERS_PER_WORD(col_1)", + "MEAN_CHARACTERS_PER_WORD(col_2)", + "POLARITY_SCORE(col_1)", + "POLARITY_SCORE(col_2)", + "col_3", + ] + ) X_t = tf.transform(X) assert set(X_t.columns) == expected_col_names assert len(X_t.columns) == 11 @@ -56,7 +64,7 @@ def test_featurizer_with_nontext(text_df): def test_featurizer_no_text(): - X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]}) + X = pd.DataFrame({"col_1": [1, 2, 3], "col_2": [4, 5, 6]}) tf = TextFeaturizer() tf.fit(X) X_t = tf.transform(X) @@ -65,17 +73,21 @@ def test_featurizer_no_text(): def test_some_missing_col_names(text_df, caplog): X = text_df - tf = TextFeaturizer(text_columns=['col_1', 'col_2', 'col_3']) - expected_col_names = set(['DIVERSITY_SCORE(col_1)', - 
'DIVERSITY_SCORE(col_2)', - 'LSA(col_1)[0]', - 'LSA(col_1)[1]', - 'LSA(col_2)[0]', - 'LSA(col_2)[1]', - 'MEAN_CHARACTERS_PER_WORD(col_1)', - 'MEAN_CHARACTERS_PER_WORD(col_2)', - 'POLARITY_SCORE(col_1)', - 'POLARITY_SCORE(col_2)']) + tf = TextFeaturizer(text_columns=["col_1", "col_2", "col_3"]) + expected_col_names = set( + [ + "DIVERSITY_SCORE(col_1)", + "DIVERSITY_SCORE(col_2)", + "LSA(col_1)[0]", + "LSA(col_1)[1]", + "LSA(col_2)[0]", + "LSA(col_2)[1]", + "MEAN_CHARACTERS_PER_WORD(col_1)", + "MEAN_CHARACTERS_PER_WORD(col_2)", + "POLARITY_SCORE(col_1)", + "POLARITY_SCORE(col_2)", + ] + ) tf.fit(X) X_t = tf.transform(X) assert set(X_t.columns) == expected_col_names @@ -84,10 +96,13 @@ def test_some_missing_col_names(text_df, caplog): def test_empty_text_column(): - X = pd.DataFrame({'col_1': []}) - X = infer_feature_types(X, {'col_1': 'NaturalLanguage'}) + X = pd.DataFrame({"col_1": []}) + X = infer_feature_types(X, {"col_1": "NaturalLanguage"}) tf = TextFeaturizer() - with pytest.raises(ValueError, match="empty vocabulary; perhaps the documents only contain stop words"): + with pytest.raises( + ValueError, + match="empty vocabulary; perhaps the documents only contain stop words", + ): tf.fit(X) @@ -95,27 +110,37 @@ def test_invalid_text_column(): # we assume this sort of data would fail to validate as text data up the stack # but just in case, make sure our component will convert non-str values to str X = pd.DataFrame( - {'col_1': [ - 'I\'m singing in the rain!$%^ do do do do do da do', - 'just singing in the rain.................. \n', - 325, - np.nan, - None, - 'I\'m happy again!!! lalalalalalalalalalala']}) - X = infer_feature_types(X, {'col_1': 'NaturalLanguage'}) + { + "col_1": [ + "I'm singing in the rain!$%^ do do do do do da do", + "just singing in the rain.................. \n", + 325, + np.nan, + None, + "I'm happy again!!! lalalalalalalalalalala", + ] + } + ) + X = infer_feature_types(X, {"col_1": "NaturalLanguage"}) tf = TextFeaturizer() tf.fit(X) def test_no_null_output(): X = pd.DataFrame( - {'col_1': ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!', - 'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.', - 'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!'], - 'col_2': ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', - 'I dreamed a dream in days gone by, when hope was high and life worth living Red, the blood of angry men - black, the dark of ages past', - ':)'] - }) + { + "col_1": [ + "I'm singing in the rain! Just singing in the rain, what a glorious feeling, I'm happy again!", + "In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.", + "I'm gonna be the main event, like no king was before! I'm brushing up on looking down, I'm working on my ROAR!", + ], + "col_2": [ + "do you hear the people sing? 
Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!", + "I dreamed a dream in days gone by, when hope was high and life worth living Red, the blood of angry men - black, the dark of ages past", + ":)", + ], + } + ) tf = TextFeaturizer() tf.fit(X) X_t = tf.transform(X) @@ -123,22 +148,39 @@ def test_no_null_output(): def test_index_col_names(): - X = np.array([['I\'m singing in the rain!$%^ do do do do do da do', 'do you hear the people sing?////////////////////////////////////'], - ['just singing in the rain.................. \n', 'singing the songs of angry men\n'], - ['\t\n\n\n\nWhat a glorious feelinggggggggggg, I\'m happy again!!! lalalalalalalalalalala', '\tIt is the music of a people who will NOT be slaves again!!!!!!!!!!!']]) + X = np.array( + [ + [ + "I'm singing in the rain!$%^ do do do do do da do", + "do you hear the people sing?////////////////////////////////////", + ], + [ + "just singing in the rain.................. \n", + "singing the songs of angry men\n", + ], + [ + "\t\n\n\n\nWhat a glorious feelinggggggggggg, I'm happy again!!! lalalalalalalalalalala", + "\tIt is the music of a people who will NOT be slaves again!!!!!!!!!!!", + ], + ] + ) tf = TextFeaturizer() tf.fit(X) - expected_col_names = set(['DIVERSITY_SCORE(0)', - 'DIVERSITY_SCORE(1)', - 'LSA(0)[0]', - 'LSA(0)[1]', - 'LSA(1)[0]', - 'LSA(1)[1]', - 'MEAN_CHARACTERS_PER_WORD(0)', - 'MEAN_CHARACTERS_PER_WORD(1)', - 'POLARITY_SCORE(0)', - 'POLARITY_SCORE(1)']) + expected_col_names = set( + [ + "DIVERSITY_SCORE(0)", + "DIVERSITY_SCORE(1)", + "LSA(0)[0]", + "LSA(0)[1]", + "LSA(1)[0]", + "LSA(1)[1]", + "MEAN_CHARACTERS_PER_WORD(0)", + "MEAN_CHARACTERS_PER_WORD(1)", + "POLARITY_SCORE(0)", + "POLARITY_SCORE(1)", + ] + ) X_t = tf.transform(X) assert set(X_t.columns) == expected_col_names assert len(X_t.columns) == 10 @@ -147,25 +189,35 @@ def test_index_col_names(): def test_float_col_names(): X = pd.DataFrame( - {4.75: ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!', - 'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.', - 'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!'], - -1: ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', - 'I dreamed a dream in days gone by, when hope was high and life worth living', - 'Red, the blood of angry men - black, the dark of ages past'] - }) + { + 4.75: [ + "I'm singing in the rain! Just singing in the rain, what a glorious feeling, I'm happy again!", + "In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.", + "I'm gonna be the main event, like no king was before! I'm brushing up on looking down, I'm working on my ROAR!", + ], + -1: [ + "do you hear the people sing? 
Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!", + "I dreamed a dream in days gone by, when hope was high and life worth living", + "Red, the blood of angry men - black, the dark of ages past", + ], + } + ) tf = TextFeaturizer() tf.fit(X) - expected_col_names = set(['DIVERSITY_SCORE(4.75)', - 'DIVERSITY_SCORE(-1.0)', - 'LSA(4.75)[0]', - 'LSA(4.75)[1]', - 'LSA(-1.0)[0]', - 'LSA(-1.0)[1]', - 'MEAN_CHARACTERS_PER_WORD(4.75)', - 'MEAN_CHARACTERS_PER_WORD(-1.0)', - 'POLARITY_SCORE(4.75)', - 'POLARITY_SCORE(-1.0)']) + expected_col_names = set( + [ + "DIVERSITY_SCORE(4.75)", + "DIVERSITY_SCORE(-1.0)", + "LSA(4.75)[0]", + "LSA(4.75)[1]", + "LSA(-1.0)[0]", + "LSA(-1.0)[1]", + "MEAN_CHARACTERS_PER_WORD(4.75)", + "MEAN_CHARACTERS_PER_WORD(-1.0)", + "POLARITY_SCORE(4.75)", + "POLARITY_SCORE(-1.0)", + ] + ) X_t = tf.transform(X) assert set(X_t.columns) == expected_col_names assert len(X_t.columns) == 10 @@ -174,13 +226,19 @@ def test_float_col_names(): def test_output_null(): X = pd.DataFrame( - {'col_1': ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!', - 'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.', - 'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!'], - 'col_2': ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', - 'I dreamed a dream in days gone by, when hope was high and life worth living Red, the blood of angry men - black, the dark of ages past', - ':)'] - }) + { + "col_1": [ + "I'm singing in the rain! Just singing in the rain, what a glorious feeling, I'm happy again!", + "In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.", + "I'm gonna be the main event, like no king was before! I'm brushing up on looking down, I'm working on my ROAR!", + ], + "col_2": [ + "do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!", + "I dreamed a dream in days gone by, when hope was high and life worth living Red, the blood of angry men - black, the dark of ages past", + ":)", + ], + } + ) tf = TextFeaturizer() tf.fit(X) X_t = tf.transform(X) @@ -189,67 +247,90 @@ def test_output_null(): def test_diversity_primitive_output(): X = pd.DataFrame( - {'diverse': ['This is a very diverse string which does not contain any repeated words at all', - 'Here here each each word word is is repeated repeated exactly exactly twice twice', - 'A sentence sentence with just a little overlap here and there there there']}) + { + "diverse": [ + "This is a very diverse string which does not contain any repeated words at all", + "Here here each each word word is is repeated repeated exactly exactly twice twice", + "A sentence sentence with just a little overlap here and there there there", + ] + } + ) tf = TextFeaturizer() tf.fit(X) - expected_features = pd.Series([1.0, 0.5, 0.75], name='DIVERSITY_SCORE(diverse)') + expected_features = pd.Series([1.0, 0.5, 0.75], name="DIVERSITY_SCORE(diverse)") X_t = tf.transform(X) - features = X_t['DIVERSITY_SCORE(diverse)'] + features = X_t["DIVERSITY_SCORE(diverse)"] assert_series_equal(expected_features, features) def test_lsa_primitive_output(): X = pd.DataFrame( - {'lsa': ['do you hear the people sing? 
Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', - 'I dreamed a dream in days gone by, when hope was high and life worth living', - 'Red, the blood of angry men - black, the dark of ages past']}) + { + "lsa": [ + "do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!", + "I dreamed a dream in days gone by, when hope was high and life worth living", + "Red, the blood of angry men - black, the dark of ages past", + ] + } + ) tf = TextFeaturizer() tf.fit(X) - expected_features = pd.DataFrame([[0.832, 0.], - [0., 1.], - [0.832, 0.]], columns=['LSA(lsa)[0]', 'LSA(lsa)[1]']) + expected_features = pd.DataFrame( + [[0.832, 0.0], [0.0, 1.0], [0.832, 0.0]], columns=["LSA(lsa)[0]", "LSA(lsa)[1]"] + ) X_t = tf.transform(X) - cols = [col for col in X_t.columns if 'LSA' in col] + cols = [col for col in X_t.columns if "LSA" in col] features = X_t[cols] assert_frame_equal(expected_features, features, atol=1e-3) def test_mean_characters_primitive_output(): X = pd.DataFrame( - {'mean_characters': ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!', - 'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.', - 'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!']}) + { + "mean_characters": [ + "I'm singing in the rain! Just singing in the rain, what a glorious feeling, I'm happy again!", + "In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.", + "I'm gonna be the main event, like no king was before! I'm brushing up on looking down, I'm working on my ROAR!", + ] + } + ) tf = TextFeaturizer() tf.fit(X) - expected_features = pd.Series([4.11764705882352, 3.45, 3.72727272727], name='MEAN_CHARACTERS_PER_WORD(mean_characters)') + expected_features = pd.Series( + [4.11764705882352, 3.45, 3.72727272727], + name="MEAN_CHARACTERS_PER_WORD(mean_characters)", + ) X_t = tf.transform(X) - features = X_t['MEAN_CHARACTERS_PER_WORD(mean_characters)'] + features = X_t["MEAN_CHARACTERS_PER_WORD(mean_characters)"] assert_series_equal(expected_features, features) def test_polarity_primitive_output(): X = pd.DataFrame( - {'polarity': ['This is neutral.', - 'Everything is bad. Nothing is happy, he hates milk and can\'t stand gross foods, we are being very negative.', - 'Everything is awesome! Everything is cool when you\'re part of a team! He loves milk and cookies!']}) + { + "polarity": [ + "This is neutral.", + "Everything is bad. Nothing is happy, he hates milk and can't stand gross foods, we are being very negative.", + "Everything is awesome! Everything is cool when you're part of a team! 
He loves milk and cookies!", + ] + } + ) tf = TextFeaturizer() tf.fit(X) - expected_features = pd.Series([0.0, -0.214, 0.602], name='POLARITY_SCORE(polarity)') + expected_features = pd.Series([0.0, -0.214, 0.602], name="POLARITY_SCORE(polarity)") X_t = tf.transform(X) - features = X_t['POLARITY_SCORE(polarity)'] + features = X_t["POLARITY_SCORE(polarity)"] assert_series_equal(expected_features, features) def test_featurizer_with_custom_indices(text_df): X = text_df X = X.set_index(pd.Series([2, 5, 19])) - tf = TextFeaturizer(text_columns=['col_1', 'col_2']) + tf = TextFeaturizer(text_columns=["col_1", "col_2"]) tf.fit(X) X_t = tf.transform(X) assert not X_t.isnull().any().any() @@ -258,19 +339,32 @@ def test_featurizer_with_custom_indices(text_df): def test_text_featurizer_does_not_modify_input_data(text_df): X = text_df expected = X.copy() - tf = TextFeaturizer(text_columns=['col_1', 'col_2']) + tf = TextFeaturizer(text_columns=["col_1", "col_2"]) _ = tf.fit_transform(X) pd.testing.assert_frame_equal(X, expected) -@pytest.mark.parametrize("X_df", [pd.DataFrame(pd.Series([1, 2, 10], dtype="Int64")), - pd.DataFrame(pd.Series([1., 2., 10.], dtype="float")), - pd.DataFrame(pd.Series(['a', 'b', 'ab'], dtype="category")), - pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), - pd.DataFrame(pd.Series(['this will be a natural language column because length', 'yay', 'hay'], dtype="string"))]) +@pytest.mark.parametrize( + "X_df", + [ + pd.DataFrame(pd.Series([1, 2, 10], dtype="Int64")), + pd.DataFrame(pd.Series([1.0, 2.0, 10.0], dtype="float")), + pd.DataFrame(pd.Series(["a", "b", "ab"], dtype="category")), + pd.DataFrame(pd.Series([True, False, True], dtype="boolean")), + pd.DataFrame( + pd.Series( + ["this will be a natural language column because length", "yay", "hay"], + dtype="string", + ) + ), + ], +) def test_text_featurizer_woodwork_custom_overrides_returned_by_components(X_df): X_df = X_df.copy() - X_df['text col'] = pd.Series(['this will be a natural language column because length', 'yay', 'hay'], dtype="string") + X_df["text col"] = pd.Series( + ["this will be a natural language column because length", "yay", "hay"], + dtype="string", + ) y = pd.Series([1, 2, 1]) override_types = [Integer, Double, Categorical, Boolean] tf = TextFeaturizer() @@ -285,20 +379,28 @@ def test_text_featurizer_woodwork_custom_overrides_returned_by_components(X_df): tf.fit(X) transformed = tf.transform(X, y) assert isinstance(transformed, pd.DataFrame) - assert transformed.ww.logical_types == {0: logical_type, 'LSA(text col)[0]': Double, - 'LSA(text col)[1]': Double, - 'DIVERSITY_SCORE(text col)': Double, - 'MEAN_CHARACTERS_PER_WORD(text col)': Double, - 'POLARITY_SCORE(text col)': Double} + assert transformed.ww.logical_types == { + 0: logical_type, + "LSA(text col)[0]": Double, + "LSA(text col)[1]": Double, + "DIVERSITY_SCORE(text col)": Double, + "MEAN_CHARACTERS_PER_WORD(text col)": Double, + "POLARITY_SCORE(text col)": Double, + } @patch("featuretools.dfs") def test_text_featurizer_sets_max_depth_1(mock_dfs): X = pd.DataFrame( - {'polarity': ['This is neutral.', - 'Everything is bad. Nothing is happy, he hates milk and can\'t stand gross foods, we are being very negative.', - 'Everything is awesome! Everything is cool when you\'re part of a team! He loves milk and cookies!']}) + { + "polarity": [ + "This is neutral.", + "Everything is bad. Nothing is happy, he hates milk and can't stand gross foods, we are being very negative.", + "Everything is awesome! 
Everything is cool when you're part of a team! He loves milk and cookies!", + ] + } + ) tf = TextFeaturizer() tf.fit(X) _, kwargs = mock_dfs.call_args - assert kwargs['max_depth'] == 1 + assert kwargs["max_depth"] == 1 diff --git a/evalml/tests/component_tests/test_time_series_baseline_estimators.py b/evalml/tests/component_tests/test_time_series_baseline_estimators.py index b47db41a6d..1d915a8ee4 100644 --- a/evalml/tests/component_tests/test_time_series_baseline_estimators.py +++ b/evalml/tests/component_tests/test_time_series_baseline_estimators.py @@ -12,7 +12,7 @@ def test_time_series_baseline_regressor_init(): def test_time_series_baseline_gap_negative(): - with pytest.raises(ValueError, match='gap value must be a positive integer.'): + with pytest.raises(ValueError, match="gap value must be a positive integer."): TimeSeriesBaselineEstimator(gap=-1) diff --git a/evalml/tests/component_tests/test_undersampler.py b/evalml/tests/component_tests/test_undersampler.py index bd707f4097..c63d5b7d4a 100644 --- a/evalml/tests/component_tests/test_undersampler.py +++ b/evalml/tests/component_tests/test_undersampler.py @@ -10,7 +10,7 @@ def test_init(): "sampling_ratio": 1, "min_samples": 1, "min_percentage": 0.5, - "sampling_ratio_dict": None + "sampling_ratio_dict": None, } undersampler = Undersampler(**parameters) assert undersampler.parameters == parameters @@ -55,7 +55,9 @@ def test_undersample_imbalanced(data_type, make_data_type): assert len(new_y) == 750 value_counts = new_y.value_counts() assert value_counts.values[1] / value_counts.values[0] == sampling_ratio - pd.testing.assert_series_equal(value_counts, pd.Series([600, 150], index=[1, 0]), check_dtype=False) + pd.testing.assert_series_equal( + value_counts, pd.Series([600, 150], index=[1, 0]), check_dtype=False + ) transform_X, transform_y = undersampler.transform(X, y) @@ -63,10 +65,15 @@ def test_undersample_imbalanced(data_type, make_data_type): np.testing.assert_equal(None, transform_y) -@pytest.mark.parametrize("dictionary,msg", [({'majority': 0.5}, "Sampling dictionary contains a different number"), - ({'minority': 1}, "Sampling dictionary contains a different number"), - ({0: 1, 1: 0.1}, "Dictionary keys are different from"), - ({1: 0.1}, "Sampling dictionary contains a different number")]) +@pytest.mark.parametrize( + "dictionary,msg", + [ + ({"majority": 0.5}, "Sampling dictionary contains a different number"), + ({"minority": 1}, "Sampling dictionary contains a different number"), + ({0: 1, 1: 0.1}, "Dictionary keys are different from"), + ({1: 0.1}, "Sampling dictionary contains a different number"), + ], +) def test_undersampler_sampling_dict_errors(dictionary, msg): X = np.array([[i] for i in range(1000)]) y = np.array(["minority"] * 150 + ["majority"] * 850) @@ -76,11 +83,16 @@ def test_undersampler_sampling_dict_errors(dictionary, msg): undersampler.fit_transform(X, y) -@pytest.mark.parametrize("sampling_ratio_dict,expected_dict_values", [({0: 1, 1: 0.5}, {0: 150, 1: 300}), - ({0: 1, 1: 0.25}, {0: 150, 1: 600}), - ({0: 1, 1: 0.1}, {0: 150, 1: 850}), - ({0: 0.1, 1: 0.1}, {0: 150, 1: 850}), - ({0: 0.1, 1: 1}, {0: 150, 1: 150})]) +@pytest.mark.parametrize( + "sampling_ratio_dict,expected_dict_values", + [ + ({0: 1, 1: 0.5}, {0: 150, 1: 300}), + ({0: 1, 1: 0.25}, {0: 150, 1: 600}), + ({0: 1, 1: 0.1}, {0: 150, 1: 850}), + ({0: 0.1, 1: 0.1}, {0: 150, 1: 850}), + ({0: 0.1, 1: 1}, {0: 150, 1: 150}), + ], +) def test_undersampler_sampling_dict(sampling_ratio_dict, expected_dict_values): X = np.array([[i] for i in range(1000)]) 
y = np.array([0] * 150 + [1] * 850) diff --git a/evalml/tests/component_tests/test_utils.py b/evalml/tests/component_tests/test_utils.py index 0396d66a70..9fc293a64a 100644 --- a/evalml/tests/component_tests/test_utils.py +++ b/evalml/tests/component_tests/test_utils.py @@ -9,7 +9,7 @@ from evalml.pipelines import ( BinaryClassificationPipeline, MulticlassClassificationPipeline, - RegressionPipeline + RegressionPipeline, ) from evalml.pipelines.components import ComponentBase, RandomForestClassifier from evalml.pipelines.components.utils import ( @@ -17,7 +17,7 @@ all_components, handle_component_class, make_balancing_dictionary, - scikit_learn_wrapped_estimator + scikit_learn_wrapped_estimator, ) from evalml.problem_types import ProblemTypes @@ -25,7 +25,9 @@ multiclass = pd.Series([0] * 800 + [1] * 150 + [2] * 50) -def test_all_components(has_minimal_dependencies, is_running_py_39_or_above, is_using_conda): +def test_all_components( + has_minimal_dependencies, is_running_py_39_or_above, is_using_conda +): if has_minimal_dependencies: n_components = 37 elif is_using_conda: @@ -46,12 +48,16 @@ def test_handle_component_class_names(): assert inspect.isclass(name_ret) assert issubclass(name_ret, ComponentBase) - invalid_name = 'This Component Does Not Exist' - with pytest.raises(MissingComponentError, match='Component "This Component Does Not Exist" was not found'): + invalid_name = "This Component Does Not Exist" + with pytest.raises( + MissingComponentError, + match='Component "This Component Does Not Exist" was not found', + ): handle_component_class(invalid_name) class NonComponent: pass + with pytest.raises(ValueError): handle_component_class(NonComponent()) @@ -59,12 +65,18 @@ class NonComponent: def test_scikit_learn_wrapper_invalid_problem_type(): evalml_pipeline = MulticlassClassificationPipeline([RandomForestClassifier]) evalml_pipeline.problem_type = None - with pytest.raises(ValueError, match="Could not wrap EvalML object in scikit-learn wrapper."): + with pytest.raises( + ValueError, match="Could not wrap EvalML object in scikit-learn wrapper." 
+ ): scikit_learn_wrapped_estimator(evalml_pipeline) def test_scikit_learn_wrapper(X_y_binary, X_y_multi, X_y_regression, ts_data): - for estimator in [estimator for estimator in _all_estimators() if estimator.model_family != ModelFamily.ENSEMBLE]: + for estimator in [ + estimator + for estimator in _all_estimators() + if estimator.model_family != ModelFamily.ENSEMBLE + ]: for problem_type in estimator.supported_problem_types: if problem_type == ProblemTypes.BINARY: X, y = X_y_binary @@ -78,7 +90,11 @@ def test_scikit_learn_wrapper(X_y_binary, X_y_multi, X_y_regression, ts_data): X, y = X_y_regression pipeline_class = RegressionPipeline - elif problem_type in [ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.TIME_SERIES_MULTICLASS, ProblemTypes.TIME_SERIES_BINARY]: + elif problem_type in [ + ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.TIME_SERIES_MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ]: continue evalml_pipeline = pipeline_class([estimator]) @@ -107,16 +123,20 @@ def test_make_balancing_dictionary_errors(): make_balancing_dictionary(pd.Series([]), 0.5) -@pytest.mark.parametrize("y,sampling_ratio,result", - [(binary, 1, {0: 800, 1: 800}), - (binary, 0.5, {0: 800, 1: 400}), - (binary, 0.25, {0: 800, 1: 200}), - (binary, 0.1, {0: 800, 1: 200}), - (multiclass, 1, {0: 800, 1: 800, 2: 800}), - (multiclass, 0.5, {0: 800, 1: 400, 2: 400}), - (multiclass, 0.25, {0: 800, 1: 200, 2: 200}), - (multiclass, 0.1, {0: 800, 1: 150, 2: 80}), - (multiclass, 0.01, {0: 800, 1: 150, 2: 50})]) +@pytest.mark.parametrize( + "y,sampling_ratio,result", + [ + (binary, 1, {0: 800, 1: 800}), + (binary, 0.5, {0: 800, 1: 400}), + (binary, 0.25, {0: 800, 1: 200}), + (binary, 0.1, {0: 800, 1: 200}), + (multiclass, 1, {0: 800, 1: 800, 2: 800}), + (multiclass, 0.5, {0: 800, 1: 400, 2: 400}), + (multiclass, 0.25, {0: 800, 1: 200, 2: 200}), + (multiclass, 0.1, {0: 800, 1: 150, 2: 80}), + (multiclass, 0.01, {0: 800, 1: 150, 2: 50}), + ], +) def test_make_balancing_dictionary(y, sampling_ratio, result): dic = make_balancing_dictionary(y, sampling_ratio) assert dic == result diff --git a/evalml/tests/component_tests/test_xgboost_classifier.py b/evalml/tests/component_tests/test_xgboost_classifier.py index eacb168902..798dbfadd6 100644 --- a/evalml/tests/component_tests/test_xgboost_classifier.py +++ b/evalml/tests/component_tests/test_xgboost_classifier.py @@ -9,7 +9,7 @@ from evalml.problem_types import ProblemTypes from evalml.utils import SEED_BOUNDS, get_random_state -xgb = importorskip('xgboost', reason='Skipping test because xgboost not installed') +xgb = importorskip("xgboost", reason="Skipping test because xgboost not installed") def test_xgboost_classifier_random_seed_bounds_seed(X_y_binary): @@ -18,10 +18,14 @@ def test_xgboost_classifier_random_seed_bounds_seed(X_y_binary): col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) y = pd.Series(y) - clf = XGBoostClassifier(n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.min_bound) + clf = XGBoostClassifier( + n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.min_bound + ) fitted = clf.fit(X, y) assert isinstance(fitted, XGBoostClassifier) - clf = XGBoostClassifier(n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.max_bound) + clf = XGBoostClassifier( + n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.max_bound + ) clf.fit(X, y) @@ -37,7 +41,7 @@ def test_xgboost_feature_name_with_random_ascii(problem_type, X_y_binary, X_y_mu expected_cols = 3 X = 
get_random_state(clf.random_seed).random((X.shape[0], len(string.printable))) - col_names = ['column_{}'.format(ascii_char) for ascii_char in string.printable] + col_names = ["column_{}".format(ascii_char) for ascii_char in string.printable] X = pd.DataFrame(X, columns=col_names) clf.fit(X, y) @@ -53,11 +57,13 @@ def test_xgboost_feature_name_with_random_ascii(problem_type, X_y_binary, X_y_mu assert not np.isnan(clf.feature_importance).all().all() -@pytest.mark.parametrize("data_type", ['pd', 'ww']) +@pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_xgboost_multiindex(data_type, X_y_binary, make_data_type): X, y = X_y_binary X = pd.DataFrame(X) - col_names = [('column_{}'.format(num), '{}'.format(num)) for num in range(len(X.columns))] + col_names = [ + ("column_{}".format(num), "{}".format(num)) for num in range(len(X.columns)) + ] X.columns = pd.MultiIndex.from_tuples(col_names) X = make_data_type(data_type, X) y = make_data_type(data_type, y) diff --git a/evalml/tests/component_tests/test_xgboost_regressor.py b/evalml/tests/component_tests/test_xgboost_regressor.py index c3af096922..91eee18c9e 100644 --- a/evalml/tests/component_tests/test_xgboost_regressor.py +++ b/evalml/tests/component_tests/test_xgboost_regressor.py @@ -8,7 +8,7 @@ from evalml.pipelines.components import XGBoostRegressor from evalml.utils import SEED_BOUNDS, get_random_state -xgb = importorskip('xgboost', reason='Skipping test because xgboost not installed') +xgb = importorskip("xgboost", reason="Skipping test because xgboost not installed") def test_xgboost_regressor_random_seed_bounds_seed(X_y_regression): @@ -17,10 +17,14 @@ def test_xgboost_regressor_random_seed_bounds_seed(X_y_regression): col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) y = pd.Series(y) - clf = XGBoostRegressor(n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.min_bound) + clf = XGBoostRegressor( + n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.min_bound + ) fitted = clf.fit(X, y) assert isinstance(fitted, XGBoostRegressor) - clf = XGBoostRegressor(n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.max_bound) + clf = XGBoostRegressor( + n_estimators=1, max_depth=1, random_seed=SEED_BOUNDS.max_bound + ) clf.fit(X, y) @@ -28,7 +32,7 @@ def test_xgboost_feature_name_with_random_ascii(X_y_regression): X, y = X_y_regression clf = XGBoostRegressor() X = get_random_state(clf.random_seed).random((X.shape[0], len(string.printable))) - col_names = ['column_{}'.format(ascii_char) for ascii_char in string.printable] + col_names = ["column_{}".format(ascii_char) for ascii_char in string.printable] X = pd.DataFrame(X, columns=col_names) clf.fit(X, y) predictions = clf.predict(X) @@ -39,11 +43,13 @@ def test_xgboost_feature_name_with_random_ascii(X_y_regression): assert not np.isnan(clf.feature_importance).all().all() -@pytest.mark.parametrize("data_type", ['pd', 'ww']) +@pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_xgboost_multiindex(data_type, X_y_regression, make_data_type): X, y = X_y_regression X = pd.DataFrame(X) - col_names = [('column_{}'.format(num), '{}'.format(num)) for num in range(len(X.columns))] + col_names = [ + ("column_{}".format(num), "{}".format(num)) for num in range(len(X.columns)) + ] X.columns = pd.MultiIndex.from_tuples(col_names) X = make_data_type(data_type, X) y = make_data_type(data_type, y) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 76afdf8b17..bff5409ec9 100644 --- a/evalml/tests/conftest.py +++ 
b/evalml/tests/conftest.py @@ -10,17 +10,14 @@ from evalml.demos import load_fraud from evalml.model_family import ModelFamily -from evalml.objectives.utils import ( - get_core_objectives, - get_non_core_objectives -) +from evalml.objectives.utils import get_core_objectives, get_non_core_objectives from evalml.pipelines import ( BinaryClassificationPipeline, MulticlassClassificationPipeline, RegressionPipeline, TimeSeriesBinaryClassificationPipeline, TimeSeriesMulticlassClassificationPipeline, - TimeSeriesRegressionPipeline + TimeSeriesRegressionPipeline, ) from evalml.pipelines.components import ( DecisionTreeClassifier, @@ -28,10 +25,10 @@ Estimator, LogisticRegressionClassifier, StackedEnsembleClassifier, - StackedEnsembleRegressor + StackedEnsembleRegressor, ) from evalml.pipelines.components.ensemble.stacked_ensemble_base import ( - _nonstackable_model_families + _nonstackable_model_families, ) from evalml.pipelines.components.utils import _all_estimators from evalml.problem_types import ProblemTypes, handle_problem_types @@ -39,70 +36,118 @@ def create_mock_pipeline(estimator, problem_type): if problem_type == ProblemTypes.BINARY: + class MockBinaryPipelineWithOnlyEstimator(BinaryClassificationPipeline): custom_name = f"Pipeline with {estimator.name}" component_graph = [estimator] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) + return MockBinaryPipelineWithOnlyEstimator elif problem_type == ProblemTypes.MULTICLASS: + class MockMulticlassPipelineWithOnlyEstimator(MulticlassClassificationPipeline): custom_name = f"Pipeline with {estimator.name}" component_graph = [estimator] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) return MockMulticlassPipelineWithOnlyEstimator elif problem_type == ProblemTypes.REGRESSION: + class MockRegressionPipelineWithOnlyEstimator(RegressionPipeline): custom_name = f"Pipeline with {estimator.name}" component_graph = [estimator] + return MockRegressionPipelineWithOnlyEstimator elif problem_type == ProblemTypes.TIME_SERIES_REGRESSION: + class MockTSRegressionPipelineWithOnlyEstimator(TimeSeriesRegressionPipeline): custom_name = f"Pipeline with {estimator.name}" component_graph = [estimator] + return MockTSRegressionPipelineWithOnlyEstimator elif problem_type == ProblemTypes.TIME_SERIES_BINARY: - class MockTSRegressionPipelineWithOnlyEstimator(TimeSeriesBinaryClassificationPipeline): + + class MockTSRegressionPipelineWithOnlyEstimator( + TimeSeriesBinaryClassificationPipeline + ): custom_name = f"Pipeline with {estimator.name}" component_graph = [estimator] + return MockTSRegressionPipelineWithOnlyEstimator elif problem_type == ProblemTypes.TIME_SERIES_MULTICLASS: - class MockTSRegressionPipelineWithOnlyEstimator(TimeSeriesMulticlassClassificationPipeline): + + class MockTSRegressionPipelineWithOnlyEstimator( + TimeSeriesMulticlassClassificationPipeline + ): custom_name = f"Pipeline with {estimator.name}" component_graph = [estimator] + return MockTSRegressionPipelineWithOnlyEstimator @pytest.fixture def all_pipeline_classes(): all_possible_pipeline_classes = [] - 
for estimator in [estimator for estimator in _all_estimators() if estimator != StackedEnsembleClassifier and estimator != StackedEnsembleRegressor]: + for estimator in [ + estimator + for estimator in _all_estimators() + if estimator != StackedEnsembleClassifier + and estimator != StackedEnsembleRegressor + ]: for problem_type in estimator.supported_problem_types: - all_possible_pipeline_classes.append(create_mock_pipeline(estimator, problem_type)) + all_possible_pipeline_classes.append( + create_mock_pipeline(estimator, problem_type) + ) return all_possible_pipeline_classes @pytest.fixture def all_binary_pipeline_classes(all_pipeline_classes): - return [pipeline_class for pipeline_class in all_pipeline_classes if issubclass(pipeline_class, BinaryClassificationPipeline)] + return [ + pipeline_class + for pipeline_class in all_pipeline_classes + if issubclass(pipeline_class, BinaryClassificationPipeline) + ] @pytest.fixture def all_multiclass_pipeline_classes(all_pipeline_classes): - return [pipeline_class for pipeline_class in all_pipeline_classes if issubclass(pipeline_class, MulticlassClassificationPipeline)] + return [ + pipeline_class + for pipeline_class in all_pipeline_classes + if issubclass(pipeline_class, MulticlassClassificationPipeline) + ] def pytest_addoption(parser): - parser.addoption("--has-minimal-dependencies", action="store_true", default=False, - help="If true, tests will assume only the dependencies in" - "core-requirements.txt have been installed.") - parser.addoption("--is-using-conda", action="store_true", default=False, - help="If true, tests will assume that they are being run as part of" - "the build_conda_pkg workflow with the feedstock.") + parser.addoption( + "--has-minimal-dependencies", + action="store_true", + default=False, + help="If true, tests will assume only the dependencies in" + "core-requirements.txt have been installed.", + ) + parser.addoption( + "--is-using-conda", + action="store_true", + default=False, + help="If true, tests will assume that they are being run as part of" + "the build_conda_pkg workflow with the feedstock.", + ) @pytest.fixture @@ -122,9 +167,17 @@ def is_running_py_39_or_above(): @pytest.fixture def assert_allowed_pipelines_equal_helper(): - def assert_allowed_pipelines_equal_helper(actual_allowed_pipelines, expected_allowed_pipelines): - for actual, expected in zip(actual_allowed_pipelines, expected_allowed_pipelines): - for pipeline_subclass in [BinaryClassificationPipeline, MulticlassClassificationPipeline, RegressionPipeline]: + def assert_allowed_pipelines_equal_helper( + actual_allowed_pipelines, expected_allowed_pipelines + ): + for actual, expected in zip( + actual_allowed_pipelines, expected_allowed_pipelines + ): + for pipeline_subclass in [ + BinaryClassificationPipeline, + MulticlassClassificationPipeline, + RegressionPipeline, + ]: if isinstance(expected, pipeline_subclass): assert isinstance(expected, pipeline_subclass) break @@ -132,35 +185,45 @@ def assert_allowed_pipelines_equal_helper(actual_allowed_pipelines, expected_all assert actual.name == expected.name assert actual.problem_type == expected.problem_type assert actual.component_graph == expected.component_graph + return assert_allowed_pipelines_equal_helper @pytest.fixture def X_y_binary(): - X, y = datasets.make_classification(n_samples=100, n_features=20, - n_informative=2, n_redundant=2, random_state=0) + X, y = datasets.make_classification( + n_samples=100, n_features=20, n_informative=2, n_redundant=2, random_state=0 + ) return X, y 
@pytest.fixture(scope="class") def X_y_binary_cls(request): - X, y = datasets.make_classification(n_samples=100, n_features=20, - n_informative=2, n_redundant=2, random_state=0) + X, y = datasets.make_classification( + n_samples=100, n_features=20, n_informative=2, n_redundant=2, random_state=0 + ) request.cls.X_y_binary = pd.DataFrame(X), pd.Series(y) @pytest.fixture def X_y_regression(): - X, y = datasets.make_regression(n_samples=100, n_features=20, - n_informative=3, random_state=0) + X, y = datasets.make_regression( + n_samples=100, n_features=20, n_informative=3, random_state=0 + ) return X, y @pytest.fixture def X_y_multi(): - X, y = datasets.make_classification(n_samples=100, n_features=20, n_classes=3, - n_informative=3, n_redundant=2, random_state=0) + X, y = datasets.make_classification( + n_samples=100, + n_features=20, + n_classes=3, + n_informative=3, + n_redundant=2, + random_state=0, + ) return X, y @@ -169,11 +232,11 @@ def X_y_categorical_regression(): data_path = os.path.join(os.path.dirname(__file__), "data/tips.csv") flights = pd.read_csv(data_path) - y = flights['tip'] - X = flights.drop('tip', axis=1) + y = flights["tip"] + X = flights.drop("tip", axis=1) # add categorical dtype - X['smoker'] = X['smoker'].astype('category') + X["smoker"] = X["smoker"].astype("category") return X, y @@ -182,21 +245,27 @@ def X_y_categorical_classification(): data_path = os.path.join(os.path.dirname(__file__), "data/titanic.csv") titanic = pd.read_csv(data_path) - y = titanic['Survived'] - X = titanic.drop(['Survived', 'Name'], axis=1) + y = titanic["Survived"] + X = titanic.drop(["Survived", "Name"], axis=1) return X, y @pytest.fixture() def text_df(): df = pd.DataFrame( - {'col_1': ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!', - 'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.', - 'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!'], - 'col_2': ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', - 'I dreamed a dream in days gone by, when hope was high and life worth living', - 'Red, the blood of angry men - black, the dark of ages past'] - }) + { + "col_1": [ + "I'm singing in the rain! Just singing in the rain, what a glorious feeling, I'm happy again!", + "In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.", + "I'm gonna be the main event, like no king was before! I'm brushing up on looking down, I'm working on my ROAR!", + ], + "col_2": [ + "do you hear the people sing? 
Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!", + "I dreamed a dream in days gone by, when hope was high and life worth living", + "Red, the blood of angry men - black, the dark of ages past", + ], + } + ) yield df @@ -212,37 +281,43 @@ def ts_data(): def ts_data_seasonal(): sine_ = np.linspace(-np.pi * 5, np.pi * 5, 500) X, y = pd.DataFrame({"features": range(500)}), pd.Series(sine_) - y.index = pd.date_range(start='1/1/2018', periods=500) - X.index = pd.date_range(start='1/1/2018', periods=500) + y.index = pd.date_range(start="1/1/2018", periods=500) + X.index = pd.date_range(start="1/1/2018", periods=500) return X, y @pytest.fixture def dummy_pipeline_hyperparameters(): - return {'Mock Classifier': { - 'param a': Integer(0, 10), - 'param b': Real(0, 10), - 'param c': ['option a', 'option b', 'option c'], - 'param d': ['option a', 'option b', 100, np.inf] - }} + return { + "Mock Classifier": { + "param a": Integer(0, 10), + "param b": Real(0, 10), + "param c": ["option a", "option b", "option c"], + "param d": ["option a", "option b", 100, np.inf], + } + } @pytest.fixture def dummy_pipeline_hyperparameters_unicode(): - return {'Mock Classifier': { - 'param a': Integer(0, 10), - 'param b': Real(0, 10), - 'param c': ['option a 💩', 'option b 💩', 'option c 💩'], - 'param d': ['option a', 'option b', 100, np.inf] - }} + return { + "Mock Classifier": { + "param a": Integer(0, 10), + "param b": Real(0, 10), + "param c": ["option a 💩", "option b 💩", "option c 💩"], + "param d": ["option a", "option b", 100, np.inf], + } + } @pytest.fixture def dummy_pipeline_hyperparameters_small(): - return {'Mock Classifier': { - 'param a': ['most_frequent', 'median', 'mean'], - 'param b': ['a', 'b', 'c'] - }} + return { + "Mock Classifier": { + "param a": ["most_frequent", "median", "mean"], + "param b": ["a", "b", "c"], + } + } @pytest.fixture @@ -250,13 +325,18 @@ def dummy_classifier_estimator_class(): class MockEstimator(Estimator): name = "Mock Classifier" model_family = ModelFamily.NONE - supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_MULTICLASS, ProblemTypes.TIME_SERIES_BINARY] - hyperparameter_ranges = {'a': Integer(0, 10), - 'b': Real(0, 10)} + supported_problem_types = [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ] + hyperparameter_ranges = {"a": Integer(0, 10), "b": Real(0, 10)} def __init__(self, a=1, b=0, random_seed=0): - super().__init__(parameters={"a": a, "b": b}, component_obj=None, random_seed=random_seed) + super().__init__( + parameters={"a": a, "b": b}, component_obj=None, random_seed=random_seed + ) def fit(self, X, y): return self @@ -274,13 +354,19 @@ class MockBinaryClassificationPipeline(BinaryClassificationPipeline): component_graph = [MockEstimator] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) def new(self, parameters, random_seed=0): return self.__class__(parameters, random_seed=random_seed) def clone(self): return self.__class__(self.parameters, random_seed=self.random_seed) + return MockBinaryClassificationPipeline @@ -294,13 +380,19 @@ class MockMulticlassClassificationPipeline(MulticlassClassificationPipeline): custom_name = None def __init__(self, 
parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) def new(self, parameters, random_seed=0): return self.__class__(parameters, random_seed=random_seed) def clone(self): return self.__class__(self.parameters, random_seed=self.random_seed) + return MockMulticlassClassificationPipeline @@ -310,11 +402,12 @@ class MockRegressor(Estimator): name = "Mock Regressor" model_family = ModelFamily.NONE supported_problem_types = [ProblemTypes.REGRESSION] - hyperparameter_ranges = {'a': Integer(0, 10), - 'b': Real(0, 10)} + hyperparameter_ranges = {"a": Integer(0, 10), "b": Real(0, 10)} def __init__(self, a=1, b=0, random_seed=0): - super().__init__(parameters={"a": a, "b": b}, component_obj=None, random_seed=random_seed) + super().__init__( + parameters={"a": a, "b": b}, component_obj=None, random_seed=random_seed + ) def fit(self, X, y): return self @@ -331,13 +424,19 @@ class MockRegressionPipeline(RegressionPipeline): custom_name = "Mock Regression Pipeline" def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) def new(self, parameters, random_seed=0): return self.__class__(parameters, random_seed=random_seed) def clone(self): return self.__class__(self.parameters, random_seed=self.random_seed) + return MockRegressionPipeline @@ -347,17 +446,20 @@ class MockTimeSeriesRegressor(Estimator): name = "Mock Time Series Regressor" model_family = ModelFamily.NONE supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION] - hyperparameter_ranges = {'a': Integer(0, 10), - 'b': Real(0, 10)} + hyperparameter_ranges = {"a": Integer(0, 10), "b": Real(0, 10)} def __init__(self, a=1, b=0, random_seed=0): - super().__init__(parameters={"a": a, "b": b}, component_obj=None, random_seed=random_seed) + super().__init__( + parameters={"a": a, "b": b}, component_obj=None, random_seed=random_seed + ) return MockTimeSeriesRegressor @pytest.fixture -def dummy_time_series_regression_pipeline_class(dummy_time_series_regressor_estimator_class): +def dummy_time_series_regression_pipeline_class( + dummy_time_series_regressor_estimator_class, +): MockTimeSeriesRegressor = dummy_time_series_regressor_estimator_class class MockTimeSeriesRegressionPipeline(TimeSeriesRegressionPipeline): @@ -365,7 +467,12 @@ class MockTimeSeriesRegressionPipeline(TimeSeriesRegressionPipeline): custom_name = None def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) return MockTimeSeriesRegressionPipeline @@ -379,7 +486,9 @@ class MockBinaryClassificationPipeline(TimeSeriesBinaryClassificationPipeline): component_graph = [MockEstimator] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) + super().__init__( + self.component_graph, parameters=parameters, random_seed=random_seed + ) return MockBinaryClassificationPipeline @@ -388,25 +497,47 @@ def __init__(self, 
parameters, random_seed=0): def logistic_regression_multiclass_pipeline_class(): class LogisticRegressionMulticlassPipeline(MulticlassClassificationPipeline): """Logistic Regression Pipeline for binary classification.""" + custom_name = "Logistic Regression Multiclass Pipeline" - component_graph = ['Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier'] + component_graph = [ + "Imputer", + "One Hot Encoder", + "Standard Scaler", + "Logistic Regression Classifier", + ] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) def clone(self): return self.__class__(self.parameters, random_seed=self.random_seed) + return LogisticRegressionMulticlassPipeline @pytest.fixture def logistic_regression_binary_pipeline_class(): class LogisticRegressionBinaryPipeline(BinaryClassificationPipeline): - component_graph = ['Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier'] + component_graph = [ + "Imputer", + "One Hot Encoder", + "Standard Scaler", + "Logistic Regression Classifier", + ] custom_name = "Logistic Regression Binary Pipeline" def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) def new(self, parameters, random_seed=0): return self.__class__(parameters, random_seed=random_seed) @@ -421,17 +552,29 @@ def clone(self): def linear_regression_pipeline_class(): class LinearRegressionPipeline(RegressionPipeline): """Linear Regression Pipeline for regression problems.""" - component_graph = ['One Hot Encoder', 'Imputer', 'Standard Scaler', 'Linear Regressor'] + + component_graph = [ + "One Hot Encoder", + "Imputer", + "Standard Scaler", + "Linear Regressor", + ] custom_name = "Linear Regression Pipeline" def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) def new(self, parameters, random_seed=0): return self.__class__(parameters, random_seed=random_seed) def clone(self): return self.__class__(self.parameters, random_seed=self.random_seed) + return LinearRegressionPipeline @@ -443,7 +586,9 @@ def dummy_stacked_ensemble_binary_estimator(logistic_regression_binary_pipeline_ @pytest.fixture -def dummy_stacked_ensemble_multiclass_estimator(logistic_regression_multiclass_pipeline_class): +def dummy_stacked_ensemble_multiclass_estimator( + logistic_regression_multiclass_pipeline_class, +): p1 = logistic_regression_multiclass_pipeline_class({}) ensemble_estimator = StackedEnsembleClassifier(input_pipelines=[p1], random_seed=0) return ensemble_estimator @@ -460,10 +605,13 @@ def dummy_stacked_ensemble_regressor_estimator(linear_regression_pipeline_class) def time_series_regression_pipeline_class(): class TSRegressionPipeline(TimeSeriesRegressionPipeline): """Random Forest Regression Pipeline for time series regression problems.""" - component_graph = ['Delayed Feature Transformer', 'Random Forest Regressor'] + + component_graph = 
["Delayed Feature Transformer", "Random Forest Regressor"] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) + super().__init__( + self.component_graph, parameters=parameters, random_seed=random_seed + ) return TSRegressionPipeline @@ -472,10 +620,16 @@ def __init__(self, parameters, random_seed=0): def time_series_binary_classification_pipeline_class(): class TSBinaryPipeline(TimeSeriesBinaryClassificationPipeline): """Logistic Regression Pipeline for time series binary classification problems.""" - component_graph = ['Delayed Feature Transformer', 'Logistic Regression Classifier'] + + component_graph = [ + "Delayed Feature Transformer", + "Logistic Regression Classifier", + ] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) + super().__init__( + self.component_graph, parameters=parameters, random_seed=random_seed + ) return TSBinaryPipeline @@ -484,17 +638,30 @@ def __init__(self, parameters, random_seed=0): def time_series_multiclass_classification_pipeline_class(): class TSMultiPipeline(TimeSeriesMulticlassClassificationPipeline): """Logistic Regression Pipeline for time series multiclass classification problems.""" - component_graph = ['Delayed Feature Transformer', 'Logistic Regression Classifier'] + + component_graph = [ + "Delayed Feature Transformer", + "Logistic Regression Classifier", + ] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) + super().__init__( + self.component_graph, parameters=parameters, random_seed=random_seed + ) return TSMultiPipeline @pytest.fixture def decision_tree_classification_pipeline_class(X_y_categorical_classification): - pipeline = BinaryClassificationPipeline(['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Decision Tree Classifier']) + pipeline = BinaryClassificationPipeline( + [ + "Simple Imputer", + "One Hot Encoder", + "Standard Scaler", + "Decision Tree Classifier", + ] + ) X, y = X_y_categorical_classification pipeline.fit(X, y) return pipeline @@ -505,22 +672,31 @@ def nonlinear_binary_pipeline_class(): class NonLinearBinaryPipeline(BinaryClassificationPipeline): custom_name = "Non Linear Binary Pipeline" component_graph = { - 'Imputer': ['Imputer'], - 'OneHot_RandomForest': ['One Hot Encoder', 'Imputer.x'], - 'OneHot_ElasticNet': ['One Hot Encoder', 'Imputer.x'], - 'Random Forest': ['Random Forest Classifier', 'OneHot_RandomForest.x'], - 'Elastic Net': ['Elastic Net Classifier', 'OneHot_ElasticNet.x'], - 'Logistic Regression': ['Logistic Regression Classifier', 'Random Forest', 'Elastic Net'] + "Imputer": ["Imputer"], + "OneHot_RandomForest": ["One Hot Encoder", "Imputer.x"], + "OneHot_ElasticNet": ["One Hot Encoder", "Imputer.x"], + "Random Forest": ["Random Forest Classifier", "OneHot_RandomForest.x"], + "Elastic Net": ["Elastic Net Classifier", "OneHot_ElasticNet.x"], + "Logistic Regression": [ + "Logistic Regression Classifier", + "Random Forest", + "Elastic Net", + ], } def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + ) def new(self, parameters, random_seed=0): return self.__class__(parameters, random_seed=random_seed) def clone(self): return self.__class__(self.parameters, 
random_seed=self.random_seed) + return NonLinearBinaryPipeline @@ -528,12 +704,16 @@ def clone(self): def nonlinear_multiclass_pipeline_class(): class NonLinearMulticlassPipeline(MulticlassClassificationPipeline): component_graph = { - 'Imputer': ['Imputer'], - 'OneHot_RandomForest': ['One Hot Encoder', 'Imputer.x'], - 'OneHot_ElasticNet': ['One Hot Encoder', 'Imputer.x'], - 'Random Forest': ['Random Forest Classifier', 'OneHot_RandomForest.x'], - 'Elastic Net': ['Elastic Net Classifier', 'OneHot_ElasticNet.x'], - 'Logistic Regression': ['Logistic Regression Classifier', 'Random Forest', 'Elastic Net'] + "Imputer": ["Imputer"], + "OneHot_RandomForest": ["One Hot Encoder", "Imputer.x"], + "OneHot_ElasticNet": ["One Hot Encoder", "Imputer.x"], + "Random Forest": ["Random Forest Classifier", "OneHot_RandomForest.x"], + "Elastic Net": ["Elastic Net Classifier", "OneHot_ElasticNet.x"], + "Logistic Regression": [ + "Logistic Regression Classifier", + "Random Forest", + "Elastic Net", + ], } def __init__(self, parameters, random_seed=0): @@ -544,6 +724,7 @@ def new(self, parameters, random_seed=0): def clone(self): return self.__class__(self.parameters, random_seed=self.random_seed) + return NonLinearMulticlassPipeline @@ -551,11 +732,11 @@ def clone(self): def nonlinear_regression_pipeline_class(): class NonLinearRegressionPipeline(RegressionPipeline): component_graph = { - 'Imputer': ['Imputer'], - 'OneHot': ['One Hot Encoder', 'Imputer.x'], - 'Random Forest': ['Random Forest Regressor', 'OneHot.x'], - 'Elastic Net': ['Elastic Net Regressor', 'OneHot.x'], - 'Linear Regressor': ['Linear Regressor', 'Random Forest', 'Elastic Net'] + "Imputer": ["Imputer"], + "OneHot": ["One Hot Encoder", "Imputer.x"], + "Random Forest": ["Random Forest Regressor", "OneHot.x"], + "Elastic Net": ["Elastic Net Regressor", "OneHot.x"], + "Linear Regressor": ["Linear Regressor", "Random Forest", "Elastic Net"], } def __init__(self, parameters, random_seed=0): @@ -566,6 +747,7 @@ def new(self, parameters, random_seed=0): def clone(self): return self.__class__(self.parameters, random_seed=self.random_seed) + return NonLinearRegressionPipeline @@ -591,13 +773,18 @@ def time_series_core_objectives(): @pytest.fixture def time_series_non_core_objectives(): - non_core_time_series = [obj_() for obj_ in get_non_core_objectives() - if ProblemTypes.TIME_SERIES_REGRESSION in obj_.problem_types] + non_core_time_series = [ + obj_() + for obj_ in get_non_core_objectives() + if ProblemTypes.TIME_SERIES_REGRESSION in obj_.problem_types + ] return non_core_time_series @pytest.fixture -def time_series_objectives(time_series_core_objectives, time_series_non_core_objectives): +def time_series_objectives( + time_series_core_objectives, time_series_non_core_objectives +): return time_series_core_objectives + time_series_non_core_objectives @@ -605,11 +792,20 @@ def time_series_objectives(time_series_core_objectives, time_series_non_core_obj def stackable_classifiers(helper_functions): stackable_classifiers = [] for estimator_class in _all_estimators(): - supported_problem_types = [handle_problem_types(pt) for pt in estimator_class.supported_problem_types] - if (set(supported_problem_types) == {ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS} and - estimator_class.model_family not in _nonstackable_model_families and - estimator_class.model_family != ModelFamily.ENSEMBLE): + supported_problem_types = [ + handle_problem_types(pt) for pt in 
estimator_class.supported_problem_types + ] + if ( + set(supported_problem_types) + == { + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + } + and estimator_class.model_family not in _nonstackable_model_families + and estimator_class.model_family != ModelFamily.ENSEMBLE + ): stackable_classifiers.append(estimator_class) return stackable_classifiers @@ -618,10 +814,15 @@ def stackable_classifiers(helper_functions): def stackable_regressors(helper_functions): stackable_regressors = [] for estimator_class in _all_estimators(): - supported_problem_types = [handle_problem_types(pt) for pt in estimator_class.supported_problem_types] - if (set(supported_problem_types) == {ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION} and - estimator_class.model_family not in _nonstackable_model_families and - estimator_class.model_family != ModelFamily.ENSEMBLE): + supported_problem_types = [ + handle_problem_types(pt) for pt in estimator_class.supported_problem_types + ] + if ( + set(supported_problem_types) + == {ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION} + and estimator_class.model_family not in _nonstackable_model_families + and estimator_class.model_family != ModelFamily.ENSEMBLE + ): stackable_regressors.append(estimator_class) return stackable_regressors @@ -638,8 +839,8 @@ def fitted_tree_estimators(tree_estimators, X_y_binary, X_y_regression): est_clf, est_reg = tree_estimators X_b, y_b = X_y_binary X_r, y_r = X_y_regression - X_b = pd.DataFrame(X_b, columns=[f'Testing_{col}' for col in range(len(X_b[0]))]) - X_r = pd.DataFrame(X_r, columns=[f'Testing_{col}' for col in range(len(X_r[0]))]) + X_b = pd.DataFrame(X_b, columns=[f"Testing_{col}" for col in range(len(X_b[0]))]) + X_r = pd.DataFrame(X_r, columns=[f"Testing_{col}" for col in range(len(X_r[0]))]) est_clf.fit(X_b, y_b) est_reg.fit(X_r, y_r) return est_clf, est_reg @@ -666,8 +867,10 @@ def safe_init_component_with_njobs_1(component_class): def safe_init_pipeline_with_njobs_1(pipeline_class): try: estimator = pipeline_class.component_graph[-1] - estimator_name = estimator if isinstance(estimator, str) else estimator.name - pl = pipeline_class({estimator_name: {'n_jobs': 1}}) + estimator_name = ( + estimator if isinstance(estimator, str) else estimator.name + ) + pl = pipeline_class({estimator_name: {"n_jobs": 1}}) except ValueError: pl = pipeline_class({}) return pl @@ -678,6 +881,7 @@ def safe_init_pipeline_with_njobs_1(pipeline_class): @pytest.fixture def make_data_type(): """Helper function to convert numpy or pandas input to the appropriate type for tests.""" + def _make_data_type(data_type, data): if data_type == "li": if isinstance(data, pd.DataFrame): @@ -702,15 +906,16 @@ def _make_data_type(data_type, data): @pytest.fixture def fraud_100(): X, y = load_fraud(n_rows=100) - X.ww.set_types(logical_types={'provider': 'Categorical', 'region': 'Categorical'}) + X.ww.set_types(logical_types={"provider": "Categorical", "region": "Categorical"}) return X, y @pytest.fixture def mock_imbalanced_data_X_y(): """Helper function to return an imbalanced binary or multiclass dataset""" + def _imbalanced_data_X_y(problem_type, categorical_columns, size): - """"Generates a dummy classification dataset with particular amounts of class imbalance and categorical input columns. + """ "Generates a dummy classification dataset with particular amounts of class imbalance and categorical input columns. 
For our targets, we maintain a 1:5, or 0.2, class ratio of minority : majority. We only generate minimum amount for X to set the logical_types, so the length of X and y will be different. @@ -719,22 +924,31 @@ def _imbalanced_data_X_y(problem_type, categorical_columns, size): categorical_columns (str): Determines how many categorical cols to use. Either 'all', 'some', or 'none'. size (str): Either 'large' or 'small'. 'large' returns a dataset of size 21,000, while 'small' returns a size of 4200 """ - multiplier = 5 if size == 'large' else 1 + multiplier = 5 if size == "large" else 1 col_names = [f"col_{i}" for i in range(100)] # generate X to be all int values - X_dict = {col_name: [i % (j + 1) for i in range(1, 100)] for j, col_name in enumerate(col_names)} + X_dict = { + col_name: [i % (j + 1) for i in range(1, 100)] + for j, col_name in enumerate(col_names) + } X = pd.DataFrame(X_dict) - if categorical_columns == 'all': + if categorical_columns == "all": X.ww.init(logical_types={col_name: "Categorical" for col_name in col_names}) - elif categorical_columns == 'some': - X.ww.init(logical_types={col_name: "Categorical" for col_name in col_names[: len(col_names) // 2]}) + elif categorical_columns == "some": + X.ww.init( + logical_types={ + col_name: "Categorical" + for col_name in col_names[: len(col_names) // 2] + } + ) else: X.ww.init() - if problem_type == 'binary': + if problem_type == "binary": targets = [0] * 3500 + [1] * 700 else: targets = [0] * 3000 + [1] * 600 + [2] * 600 targets *= multiplier y = ww.init_series(pd.Series(targets)) return X, y + return _imbalanced_data_X_y diff --git a/evalml/tests/data_checks_tests/test_class_imbalance_data_check.py b/evalml/tests/data_checks_tests/test_class_imbalance_data_check.py index 763a056a41..982fd6bd72 100644 --- a/evalml/tests/data_checks_tests/test_class_imbalance_data_check.py +++ b/evalml/tests/data_checks_tests/test_class_imbalance_data_check.py @@ -7,7 +7,7 @@ ClassImbalanceDataCheck, DataCheckError, DataCheckMessageCode, - DataCheckWarning + DataCheckWarning, ) class_imbalance_data_check_name = ClassImbalanceDataCheck.name @@ -52,44 +52,70 @@ def test_class_imbalance_data_check_binary(input_type): y_balanced = ww.init_series(y_balanced) class_imbalance_check = ClassImbalanceDataCheck(min_samples=1, num_cv_folds=0) - assert class_imbalance_check.validate(X, y) == {"warnings": [], "errors": [], "actions": []} + assert class_imbalance_check.validate(X, y) == { + "warnings": [], + "errors": [], + "actions": [], + } assert class_imbalance_check.validate(X, y_long) == { - "warnings": [DataCheckWarning(message="The following labels fall below 10% of the target: [0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, - details={"target_values": [0]}).to_dict()], + "warnings": [ + DataCheckWarning( + message="The following labels fall below 10% of the target: [0]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, + details={"target_values": [0]}, + ).to_dict() + ], "errors": [], - "actions": [] + "actions": [], } - assert ClassImbalanceDataCheck(threshold=0.25, min_samples=1, num_cv_folds=0).validate(X, y_long) == { - "warnings": [DataCheckWarning(message="The following labels fall below 25% of the target: [0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, - details={"target_values": [0]}).to_dict()], + assert 
ClassImbalanceDataCheck( + threshold=0.25, min_samples=1, num_cv_folds=0 + ).validate(X, y_long) == { + "warnings": [ + DataCheckWarning( + message="The following labels fall below 25% of the target: [0]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, + details={"target_values": [0]}, + ).to_dict() + ], "errors": [], - "actions": [] + "actions": [], } class_imbalance_check = ClassImbalanceDataCheck(num_cv_folds=1) assert class_imbalance_check.validate(X, y) == { "warnings": [], - "errors": [DataCheckError(message="The number of instances of these targets is less than 2 * the number of cross folds = 2 instances: [1]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, - details={"target_values": [1]}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message="The number of instances of these targets is less than 2 * the number of cross folds = 2 instances: [1]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, + details={"target_values": [1]}, + ).to_dict() + ], + "actions": [], } - assert class_imbalance_check.validate(X, y_balanced) == {"warnings": [], "errors": [], "actions": []} + assert class_imbalance_check.validate(X, y_balanced) == { + "warnings": [], + "errors": [], + "actions": [], + } class_imbalance_check = ClassImbalanceDataCheck() assert class_imbalance_check.validate(X, y) == { "warnings": [], - "errors": [DataCheckError(message="The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0, 1]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, - details={"target_values": [0, 1]}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message="The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0, 1]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, + details={"target_values": [0, 1]}, + ).to_dict() + ], + "actions": [], } @@ -98,7 +124,9 @@ def test_class_imbalance_data_check_multiclass(input_type): X = pd.DataFrame() y = pd.Series([0, 2, 1, 1]) y_imbalanced_default_threshold = pd.Series([0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) - y_imbalanced_set_threshold = pd.Series([0, 2, 2, 2, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + y_imbalanced_set_threshold = pd.Series( + [0, 2, 2, 2, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + ) y_imbalanced_cv = pd.Series([0, 1, 2, 2, 1, 1, 1]) y_long = pd.Series([0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4]) @@ -119,56 +147,84 @@ def test_class_imbalance_data_check_multiclass(input_type): y_long = ww.init_series(y_long) class_imbalance_check = ClassImbalanceDataCheck(num_cv_folds=0) - assert class_imbalance_check.validate(X, y) == {"warnings": [], "errors": [], "actions": []} + assert class_imbalance_check.validate(X, y) == { + "warnings": [], + "errors": [], + "actions": [], + } assert class_imbalance_check.validate(X, y_imbalanced_default_threshold) == { - "warnings": [DataCheckWarning(message="The following labels fall below 10% of the target: [0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, - details={"target_values": [0]}).to_dict(), - DataCheckWarning(message="The following labels in the target have severe class 
imbalance because they fall under 10% of the target and have less than 100 samples: [0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE, - details={"target_values": [0]}).to_dict()], + "warnings": [ + DataCheckWarning( + message="The following labels fall below 10% of the target: [0]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, + details={"target_values": [0]}, + ).to_dict(), + DataCheckWarning( + message="The following labels in the target have severe class imbalance because they fall under 10% of the target and have less than 100 samples: [0]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE, + details={"target_values": [0]}, + ).to_dict(), + ], "errors": [], - "actions": [] + "actions": [], } - assert ClassImbalanceDataCheck(threshold=0.25, num_cv_folds=0, min_samples=1).validate(X, y_imbalanced_set_threshold) == { - "warnings": [DataCheckWarning(message="The following labels fall below 25% of the target: [3, 0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, - details={"target_values": [3, 0]}).to_dict()], + assert ClassImbalanceDataCheck( + threshold=0.25, num_cv_folds=0, min_samples=1 + ).validate(X, y_imbalanced_set_threshold) == { + "warnings": [ + DataCheckWarning( + message="The following labels fall below 25% of the target: [3, 0]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, + details={"target_values": [3, 0]}, + ).to_dict() + ], "errors": [], - "actions": [] + "actions": [], } class_imbalance_check = ClassImbalanceDataCheck(num_cv_folds=2) assert class_imbalance_check.validate(X, y_imbalanced_cv) == { "warnings": [], - "errors": [DataCheckError(message="The number of instances of these targets is less than 2 * the number of cross folds = 4 instances: [2, 0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, - details={"target_values": [2, 0]}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message="The number of instances of these targets is less than 2 * the number of cross folds = 4 instances: [2, 0]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, + details={"target_values": [2, 0]}, + ).to_dict() + ], + "actions": [], } assert class_imbalance_check.validate(X, y_long) == { "warnings": [], - "errors": [DataCheckError(message="The number of instances of these targets is less than 2 * the number of cross folds = 4 instances: [1, 0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, - details={"target_values": [1, 0]}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message="The number of instances of these targets is less than 2 * the number of cross folds = 4 instances: [1, 0]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, + details={"target_values": [1, 0]}, + ).to_dict() + ], + "actions": [], } class_imbalance_check = ClassImbalanceDataCheck() assert class_imbalance_check.validate(X, y_long) == { "warnings": [], - "errors": [DataCheckError(message="The number of instances of these targets is less than 2 * the number of cross 
folds = 6 instances: [3, 2, 1, 0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, - details={"target_values": [3, 2, 1, 0]}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message="The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [3, 2, 1, 0]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, + details={"target_values": [3, 2, 1, 0]}, + ).to_dict() + ], + "actions": [], } @@ -189,45 +245,79 @@ def test_class_imbalance_empty_and_nan(input_type): y_has_nan = ww.init_series(y_has_nan) class_imbalance_check = ClassImbalanceDataCheck(num_cv_folds=0) - assert class_imbalance_check.validate(X, y_empty) == {"warnings": [], "errors": [], "actions": []} - assert ClassImbalanceDataCheck(threshold=0.5, min_samples=1, num_cv_folds=0).validate(X, y_has_nan) == { - "warnings": [DataCheckWarning(message="The following labels fall below 50% of the target: [2.0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, - details={"target_values": [2.0]}).to_dict()], + assert class_imbalance_check.validate(X, y_empty) == { + "warnings": [], + "errors": [], + "actions": [], + } + assert ClassImbalanceDataCheck( + threshold=0.5, min_samples=1, num_cv_folds=0 + ).validate(X, y_has_nan) == { + "warnings": [ + DataCheckWarning( + message="The following labels fall below 50% of the target: [2.0]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, + details={"target_values": [2.0]}, + ).to_dict() + ], "errors": [], - "actions": [] - } - - assert ClassImbalanceDataCheck(threshold=0.5, num_cv_folds=0).validate(X, y_has_nan) == { - "warnings": [DataCheckWarning(message="The following labels fall below 50% of the target: [2.0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, - details={"target_values": [2.0]}).to_dict(), - DataCheckWarning(message="The following labels in the target have severe class imbalance because they fall under 50% of the target and have less than 100 samples: [2.0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE, - details={"target_values": [2.0]}).to_dict()], + "actions": [], + } + + assert ClassImbalanceDataCheck(threshold=0.5, num_cv_folds=0).validate( + X, y_has_nan + ) == { + "warnings": [ + DataCheckWarning( + message="The following labels fall below 50% of the target: [2.0]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, + details={"target_values": [2.0]}, + ).to_dict(), + DataCheckWarning( + message="The following labels in the target have severe class imbalance because they fall under 50% of the target and have less than 100 samples: [2.0]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE, + details={"target_values": [2.0]}, + ).to_dict(), + ], "errors": [], - "actions": [] + "actions": [], } class_imbalance_check = ClassImbalanceDataCheck(num_cv_folds=1) - assert class_imbalance_check.validate(X, y_empty) == {"warnings": [], "errors": [], "actions": []} - assert ClassImbalanceDataCheck(threshold=0.5, num_cv_folds=1).validate(X, y_has_nan) == { - "warnings": [DataCheckWarning(message="The 
following labels fall below 50% of the target: [2.0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, - details={"target_values": [2.0]}).to_dict(), - DataCheckWarning(message="The following labels in the target have severe class imbalance because they fall under 50% of the target and have less than 100 samples: [2.0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE, - details={"target_values": [2.0]}).to_dict()], - "errors": [DataCheckError(message="The number of instances of these targets is less than 2 * the number of cross folds = 2 instances: [2.0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, - details={"target_values": [2.0]}).to_dict()], - "actions": [] + assert class_imbalance_check.validate(X, y_empty) == { + "warnings": [], + "errors": [], + "actions": [], + } + assert ClassImbalanceDataCheck(threshold=0.5, num_cv_folds=1).validate( + X, y_has_nan + ) == { + "warnings": [ + DataCheckWarning( + message="The following labels fall below 50% of the target: [2.0]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, + details={"target_values": [2.0]}, + ).to_dict(), + DataCheckWarning( + message="The following labels in the target have severe class imbalance because they fall under 50% of the target and have less than 100 samples: [2.0]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE, + details={"target_values": [2.0]}, + ).to_dict(), + ], + "errors": [ + DataCheckError( + message="The number of instances of these targets is less than 2 * the number of cross folds = 2 instances: [2.0]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, + details={"target_values": [2.0]}, + ).to_dict() + ], + "actions": [], } @@ -236,7 +326,22 @@ def test_class_imbalance_nonnumeric(input_type): X = pd.DataFrame() y_bools = pd.Series([True, False, False, False, False]) y_binary = pd.Series(["yes", "no", "yes", "yes", "yes"]) - y_multiclass = pd.Series(["red", "green", "red", "red", "blue", "green", "red", "blue", "green", "red", "red", "red"]) + y_multiclass = pd.Series( + [ + "red", + "green", + "red", + "red", + "blue", + "green", + "red", + "blue", + "green", + "red", + "red", + "red", + ] + ) y_multiclass_imbalanced_folds = pd.Series(["No", "Maybe", "Maybe", "No", "Yes"]) y_binary_imbalanced_folds = pd.Series(["No", "Yes", "No", "Yes", "No"]) if input_type == "ww": @@ -245,66 +350,100 @@ def test_class_imbalance_nonnumeric(input_type): y_binary = ww.init_series(y_binary) y_multiclass = ww.init_series(y_multiclass) - class_imbalance_check = ClassImbalanceDataCheck(threshold=0.25, min_samples=1, num_cv_folds=0) + class_imbalance_check = ClassImbalanceDataCheck( + threshold=0.25, min_samples=1, num_cv_folds=0 + ) assert class_imbalance_check.validate(X, y_bools) == { - "warnings": [DataCheckWarning(message="The following labels fall below 25% of the target: [True]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, - details={"target_values": [True]}).to_dict()], + "warnings": [ + DataCheckWarning( + message="The following labels fall below 25% of the target: [True]", + data_check_name=class_imbalance_data_check_name, + 
message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, + details={"target_values": [True]}, + ).to_dict() + ], "errors": [], - "actions": [] + "actions": [], } assert class_imbalance_check.validate(X, y_binary) == { - "warnings": [DataCheckWarning(message="The following labels fall below 25% of the target: ['no']", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, - details={"target_values": ["no"]}).to_dict()], + "warnings": [ + DataCheckWarning( + message="The following labels fall below 25% of the target: ['no']", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, + details={"target_values": ["no"]}, + ).to_dict() + ], "errors": [], - "actions": [] - } - - assert ClassImbalanceDataCheck(threshold=0.35, num_cv_folds=0).validate(X, y_multiclass) == { - "warnings": [DataCheckWarning(message="The following labels fall below 35% of the target: ['green', 'blue']", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, - details={"target_values": ["green", "blue"]}).to_dict(), - DataCheckWarning(message="The following labels in the target have severe class imbalance because they fall under 35% of the target and have less than 100 samples: ['green', 'blue']", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE, - details={"target_values": ["green", "blue"]}).to_dict()], + "actions": [], + } + + assert ClassImbalanceDataCheck(threshold=0.35, num_cv_folds=0).validate( + X, y_multiclass + ) == { + "warnings": [ + DataCheckWarning( + message="The following labels fall below 35% of the target: ['green', 'blue']", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, + details={"target_values": ["green", "blue"]}, + ).to_dict(), + DataCheckWarning( + message="The following labels in the target have severe class imbalance because they fall under 35% of the target and have less than 100 samples: ['green', 'blue']", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE, + details={"target_values": ["green", "blue"]}, + ).to_dict(), + ], "errors": [], - "actions": [] + "actions": [], } class_imbalance_check = ClassImbalanceDataCheck(num_cv_folds=1) assert class_imbalance_check.validate(X, y_multiclass_imbalanced_folds) == { "warnings": [], - "errors": [DataCheckError(message="The number of instances of these targets is less than 2 * the number of cross folds = 2 instances: ['Yes']", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, - details={"target_values": ["Yes"]}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message="The number of instances of these targets is less than 2 * the number of cross folds = 2 instances: ['Yes']", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, + details={"target_values": ["Yes"]}, + ).to_dict() + ], + "actions": [], + } + assert class_imbalance_check.validate(X, y_multiclass) == { + "warnings": [], + "errors": [], + "actions": [], } - assert class_imbalance_check.validate(X, y_multiclass) == {"warnings": [], "errors": [], "actions": []} class_imbalance_check = ClassImbalanceDataCheck() assert class_imbalance_check.validate(X, 
y_binary_imbalanced_folds) == { "warnings": [], - "errors": [DataCheckError(message="The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: ['No', 'Yes']", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, - details={"target_values": ["No", "Yes"]}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message="The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: ['No', 'Yes']", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, + details={"target_values": ["No", "Yes"]}, + ).to_dict() + ], + "actions": [], } assert class_imbalance_check.validate(X, y_multiclass) == { "warnings": [], - "errors": [DataCheckError(message="The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: ['green', 'blue']", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, - details={"target_values": ["green", "blue"]}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message="The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: ['green', 'blue']", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, + details={"target_values": ["green", "blue"]}, + ).to_dict() + ], + "actions": [], } @@ -313,7 +452,9 @@ def test_class_imbalance_nonnumeric_balanced(input_type): X = pd.DataFrame() y_bools_balanced = pd.Series([True, True, True, False, False]) y_binary_balanced = pd.Series(["No", "Yes", "No", "Yes"]) - y_multiclass_balanced = pd.Series(["red", "green", "red", "red", "blue", "green", "red", "blue", "green", "red"]) + y_multiclass_balanced = pd.Series( + ["red", "green", "red", "red", "blue", "green", "red", "blue", "green", "red"] + ) if input_type == "ww": X.ww.init() y_bools_balanced = ww.init_series(y_bools_balanced) @@ -321,9 +462,21 @@ def test_class_imbalance_nonnumeric_balanced(input_type): y_multiclass_balanced = ww.init_series(y_multiclass_balanced) class_imbalance_check = ClassImbalanceDataCheck(num_cv_folds=1) - assert class_imbalance_check.validate(X, y_multiclass_balanced) == {"warnings": [], "errors": [], "actions": []} - assert class_imbalance_check.validate(X, y_binary_balanced) == {"warnings": [], "errors": [], "actions": []} - assert class_imbalance_check.validate(X, y_multiclass_balanced) == {"warnings": [], "errors": [], "actions": []} + assert class_imbalance_check.validate(X, y_multiclass_balanced) == { + "warnings": [], + "errors": [], + "actions": [], + } + assert class_imbalance_check.validate(X, y_binary_balanced) == { + "warnings": [], + "errors": [], + "actions": [], + } + assert class_imbalance_check.validate(X, y_multiclass_balanced) == { + "warnings": [], + "errors": [], + "actions": [], + } @pytest.mark.parametrize("input_type", ["pd", "ww"]) @@ -332,77 +485,107 @@ def test_class_imbalance_severe(min_samples, input_type): X = pd.DataFrame() # 0 will be < 10% of the data, but there will be 50 samples of it y_values_binary = pd.Series([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] * 50) - y_values_multiclass = pd.Series([0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] * 50) + y_values_multiclass = pd.Series( + [0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] * 50 + ) if input_type == "ww": X.ww.init() y_values_binary = 
ww.init_series(y_values_binary) y_values_multiclass = ww.init_series(y_values_multiclass) - class_imbalance_check = ClassImbalanceDataCheck(min_samples=min_samples, num_cv_folds=1) - warnings = [DataCheckWarning(message="The following labels fall below 10% of the target: [0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, - details={"target_values": [0]}).to_dict()] + class_imbalance_check = ClassImbalanceDataCheck( + min_samples=min_samples, num_cv_folds=1 + ) + warnings = [ + DataCheckWarning( + message="The following labels fall below 10% of the target: [0]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, + details={"target_values": [0]}, + ).to_dict() + ] if min_samples > 50: - warnings.append(DataCheckWarning(message=f"The following labels in the target have severe class imbalance because they fall under 10% of the target and have less than {min_samples} samples: [0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE, - details={"target_values": [0]}).to_dict()) + warnings.append( + DataCheckWarning( + message=f"The following labels in the target have severe class imbalance because they fall under 10% of the target and have less than {min_samples} samples: [0]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE, + details={"target_values": [0]}, + ).to_dict() + ) assert class_imbalance_check.validate(X, y_values_binary) == { "warnings": warnings, "errors": [], - "actions": [] + "actions": [], } assert class_imbalance_check.validate(X, y_values_multiclass) == { "warnings": warnings, "errors": [], - "actions": [] + "actions": [], } def test_class_imbalance_large_multiclass(): X = pd.DataFrame() - y_values_multiclass_large = pd.Series([0] * 20 + [1] * 25 + [2] * 99 + [3] * 105 + [4] * 900 + [5] * 900) + y_values_multiclass_large = pd.Series( + [0] * 20 + [1] * 25 + [2] * 99 + [3] * 105 + [4] * 900 + [5] * 900 + ) y_multiclass_huge = pd.Series([i % 200 for i in range(100000)]) - y_imbalanced_multiclass_huge = y_multiclass_huge.append(pd.Series([200] * 10), ignore_index=True) - y_imbalanced_multiclass_nan = y_multiclass_huge.append(pd.Series([np.nan] * 10), ignore_index=True) + y_imbalanced_multiclass_huge = y_multiclass_huge.append( + pd.Series([200] * 10), ignore_index=True + ) + y_imbalanced_multiclass_nan = y_multiclass_huge.append( + pd.Series([np.nan] * 10), ignore_index=True + ) class_imbalance_check = ClassImbalanceDataCheck(num_cv_folds=1) assert class_imbalance_check.validate(X, y_values_multiclass_large) == { - "warnings": [DataCheckWarning(message="The following labels fall below 10% of the target: [2, 1, 0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, - details={"target_values": [2, 1, 0]}).to_dict(), - DataCheckWarning(message=f"The following labels in the target have severe class imbalance because they fall under 10% of the target and have less than 100 samples: [2, 1, 0]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE, - details={"target_values": [2, 1, 0]}).to_dict()], + "warnings": [ + DataCheckWarning( + message="The following labels fall below 10% of the target: [2, 1, 0]", + data_check_name=class_imbalance_data_check_name, + 
message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, + details={"target_values": [2, 1, 0]}, + ).to_dict(), + DataCheckWarning( + message=f"The following labels in the target have severe class imbalance because they fall under 10% of the target and have less than 100 samples: [2, 1, 0]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE, + details={"target_values": [2, 1, 0]}, + ).to_dict(), + ], "errors": [], - "actions": [] + "actions": [], } assert class_imbalance_check.validate(X, y_multiclass_huge) == { "warnings": [], "errors": [], - "actions": [] + "actions": [], } assert class_imbalance_check.validate(X, y_imbalanced_multiclass_huge) == { - "warnings": [DataCheckWarning(message="The following labels fall below 10% of the target: [200]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, - details={"target_values": [200]}).to_dict(), - DataCheckWarning(message=f"The following labels in the target have severe class imbalance because they fall under 10% of the target and have less than 100 samples: [200]", - data_check_name=class_imbalance_data_check_name, - message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE, - details={"target_values": [200]}).to_dict()], + "warnings": [ + DataCheckWarning( + message="The following labels fall below 10% of the target: [200]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD, + details={"target_values": [200]}, + ).to_dict(), + DataCheckWarning( + message=f"The following labels in the target have severe class imbalance because they fall under 10% of the target and have less than 100 samples: [200]", + data_check_name=class_imbalance_data_check_name, + message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE, + details={"target_values": [200]}, + ).to_dict(), + ], "errors": [], - "actions": [] + "actions": [], } assert class_imbalance_check.validate(X, y_imbalanced_multiclass_nan) == { "warnings": [], "errors": [], - "actions": [] + "actions": [], } diff --git a/evalml/tests/data_checks_tests/test_data_check.py b/evalml/tests/data_checks_tests/test_data_check.py index 3dd93deb1e..03a3a0012e 100644 --- a/evalml/tests/data_checks_tests/test_data_check.py +++ b/evalml/tests/data_checks_tests/test_data_check.py @@ -2,10 +2,7 @@ import pytest from evalml.data_checks.data_check import DataCheck -from evalml.data_checks.data_check_message import ( - DataCheckError, - DataCheckWarning -) +from evalml.data_checks.data_check_message import DataCheckError, DataCheckWarning @pytest.fixture @@ -13,6 +10,7 @@ def mock_data_check_class(): class MockDataCheck(DataCheck): def validate(self, X, y=None): return [] + return MockDataCheck @@ -36,10 +34,16 @@ def test_data_check_validate_simple(X_y_binary): class MockDataCheck(DataCheck): def validate(self, X, y=None): - return [DataCheckError("error one", self.name), DataCheckWarning("warning one", self.name)] + return [ + DataCheckError("error one", self.name), + DataCheckWarning("warning one", self.name), + ] data_check = MockDataCheck() - assert data_check.validate(X, y=y) == [DataCheckError("error one", "MockDataCheck"), DataCheckWarning("warning one", "MockDataCheck")] + assert data_check.validate(X, y=y) == [ + DataCheckError("error one", "MockDataCheck"), + DataCheckWarning("warning one", "MockDataCheck"), + ] def test_data_check_with_param(): @@ -58,4 +62,6 @@ def validate(self, X, y=None): assert 
data_check.validate(X, y=None) == [] data_check = MockDataCheckWithParam(num=0) - assert data_check.validate(X, y=None) == [DataCheckError("Expected num == 10", "MockDataCheckWithParam")] + assert data_check.validate(X, y=None) == [ + DataCheckError("Expected num == 10", "MockDataCheckWithParam") + ] diff --git a/evalml/tests/data_checks_tests/test_data_check_action.py b/evalml/tests/data_checks_tests/test_data_check_action.py index 6f14abb2e3..6e8ffef783 100644 --- a/evalml/tests/data_checks_tests/test_data_check_action.py +++ b/evalml/tests/data_checks_tests/test_data_check_action.py @@ -10,7 +10,9 @@ def test_data_check_action_attributes(): assert data_check_action.action_code == DataCheckActionCode.DROP_COL assert data_check_action.metadata == {} - data_check_action = DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"columns": [1, 2]}) + data_check_action = DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"columns": [1, 2]} + ) assert data_check_action.action_code == DataCheckActionCode.DROP_COL assert data_check_action.metadata == {"columns": [1, 2]} @@ -23,8 +25,12 @@ def test_data_check_action_equality(): assert data_check_action == data_check_action_eq assert data_check_action_eq == data_check_action - data_check_action = DataCheckAction(DataCheckActionCode.DROP_COL, metadata={'same detail': 'same same same'}) - data_check_action_eq = DataCheckAction(DataCheckActionCode.DROP_COL, metadata={'same detail': 'same same same'}) + data_check_action = DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"same detail": "same same same"} + ) + data_check_action_eq = DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"same detail": "same same same"} + ) assert data_check_action == data_check_action assert data_check_action == data_check_action_eq @@ -33,7 +39,9 @@ def test_data_check_action_equality(): def test_data_check_action_inequality(): data_check_action = DataCheckAction(DataCheckActionCode.DROP_COL) - data_check_action_diff = DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"metadata": ["this is different"]}) + data_check_action_diff = DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"metadata": ["this is different"]} + ) assert data_check_action != data_check_action_diff assert data_check_action_diff != data_check_action @@ -41,9 +49,22 @@ def test_data_check_action_inequality(): def test_data_check_action_to_dict(): data_check_action = DataCheckAction(DataCheckActionCode.DROP_COL) - data_check_action_empty_metadata = DataCheckAction(DataCheckActionCode.DROP_COL, metadata={}) - data_check_action_with_metadata = DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"some detail": ["this is different"]}) + data_check_action_empty_metadata = DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={} + ) + data_check_action_with_metadata = DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"some detail": ["this is different"]} + ) - assert data_check_action.to_dict() == {"code": DataCheckActionCode.DROP_COL.name, "metadata": {}} - assert data_check_action_empty_metadata.to_dict() == {"code": DataCheckActionCode.DROP_COL.name, "metadata": {}} - assert data_check_action_with_metadata.to_dict() == {"code": DataCheckActionCode.DROP_COL.name, "metadata": {"some detail": ["this is different"]}} + assert data_check_action.to_dict() == { + "code": DataCheckActionCode.DROP_COL.name, + "metadata": {}, + } + assert data_check_action_empty_metadata.to_dict() == { + "code": DataCheckActionCode.DROP_COL.name, + "metadata": {}, + } + assert 
data_check_action_with_metadata.to_dict() == { + "code": DataCheckActionCode.DROP_COL.name, + "metadata": {"some detail": ["this is different"]}, + } diff --git a/evalml/tests/data_checks_tests/test_data_check_message.py b/evalml/tests/data_checks_tests/test_data_check_message.py index 96cd24e493..b3813715f6 100644 --- a/evalml/tests/data_checks_tests/test_data_check_message.py +++ b/evalml/tests/data_checks_tests/test_data_check_message.py @@ -5,32 +5,38 @@ DataCheckMessage, DataCheckMessageCode, DataCheckMessageType, - DataCheckWarning + DataCheckWarning, ) @pytest.fixture def data_check_message(): - return DataCheckMessage(message="test message", - data_check_name="test data check message name", - message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"message detail": "some message detail"}) + return DataCheckMessage( + message="test message", + data_check_name="test data check message name", + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={"message detail": "some message detail"}, + ) @pytest.fixture def data_check_warning(): - return DataCheckWarning(message="test warning", - data_check_name="test data check warning name", - message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"warning detail": "some warning detail"}) + return DataCheckWarning( + message="test warning", + data_check_name="test data check warning name", + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={"warning detail": "some warning detail"}, + ) @pytest.fixture def data_check_error(): - return DataCheckError(message="test error", - data_check_name="test data check error name", - message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"error detail": "some error detail"}) + return DataCheckError( + message="test error", + data_check_name="test data check error name", + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={"error detail": "some error detail"}, + ) def test_data_check_message_attributes(data_check_message): @@ -46,10 +52,17 @@ def test_data_check_message_str(data_check_message): def test_data_check_message_eq(data_check_message): - equal_msg = DataCheckMessage("test message", "test data check message name", DataCheckMessageCode.HIGHLY_NULL_COLS, {"message detail": "some message detail"}) + equal_msg = DataCheckMessage( + "test message", + "test data check message name", + DataCheckMessageCode.HIGHLY_NULL_COLS, + {"message detail": "some message detail"}, + ) assert data_check_message == equal_msg - equal_msg = DataCheckMessage("different test message", "different test data check message name") + equal_msg = DataCheckMessage( + "different test message", "different test data check message name" + ) assert data_check_message != equal_msg @@ -66,10 +79,17 @@ def test_data_check_warning_str(data_check_warning): def test_data_check_warning_eq(data_check_warning): - equal_msg = DataCheckWarning("test warning", "test data check warning name", DataCheckMessageCode.HIGHLY_NULL_COLS, {"warning detail": "some warning detail"}) + equal_msg = DataCheckWarning( + "test warning", + "test data check warning name", + DataCheckMessageCode.HIGHLY_NULL_COLS, + {"warning detail": "some warning detail"}, + ) assert data_check_warning == equal_msg - equal_msg = DataCheckWarning("different test warning", "different test data check warning name") + equal_msg = DataCheckWarning( + "different test warning", "different test data check warning name" + ) assert data_check_warning != equal_msg @@ -86,24 +106,33 @@ def test_data_check_error_str(data_check_error): def 
test_data_check_error_eq(data_check_error): - equal_msg = DataCheckError("test error", "test data check error name", DataCheckMessageCode.HIGHLY_NULL_COLS, {"error detail": "some error detail"}) + equal_msg = DataCheckError( + "test error", + "test data check error name", + DataCheckMessageCode.HIGHLY_NULL_COLS, + {"error detail": "some error detail"}, + ) assert data_check_error == equal_msg - equal_msg = DataCheckError("different test warning", "different test data check error name") + equal_msg = DataCheckError( + "different test warning", "different test data check error name" + ) assert data_check_error != equal_msg def test_data_check_message_attributes_optional(): - data_check_warning = DataCheckWarning(message="test warning", - data_check_name="test data check warning name") + data_check_warning = DataCheckWarning( + message="test warning", data_check_name="test data check warning name" + ) assert data_check_warning.message == "test warning" assert data_check_warning.data_check_name == "test data check warning name" assert data_check_warning.message_type == DataCheckMessageType.WARNING assert data_check_warning.message_code is None assert data_check_warning.details is None - data_check_error = DataCheckError(message="test error", - data_check_name="test data check error name") + data_check_error = DataCheckError( + message="test error", data_check_name="test data check error name" + ) assert data_check_error.message == "test error" assert data_check_error.data_check_name == "test data check error name" assert data_check_error.message_type == DataCheckMessageType.ERROR @@ -118,42 +147,44 @@ def test_warning_error_eq(): def test_data_check_message_to_dict(): - error = DataCheckError(message="test message", - data_check_name="same test name", - message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"detail 1": "error info"}) + error = DataCheckError( + message="test message", + data_check_name="same test name", + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={"detail 1": "error info"}, + ) assert error.to_dict() == { "message": "test message", "level": "error", "data_check_name": "same test name", "code": DataCheckMessageCode.HIGHLY_NULL_COLS.name, - "details": {"detail 1": "error info"} + "details": {"detail 1": "error info"}, } - warning = DataCheckWarning(message="test message", - data_check_name="same test name", - message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"detail 1": "warning info"}) + warning = DataCheckWarning( + message="test message", + data_check_name="same test name", + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={"detail 1": "warning info"}, + ) assert warning.to_dict() == { "message": "test message", "level": "warning", "data_check_name": "same test name", "code": DataCheckMessageCode.HIGHLY_NULL_COLS.name, - "details": {"detail 1": "warning info"} + "details": {"detail 1": "warning info"}, } def test_data_check_message_to_dict_optional(): - error = DataCheckError(message="test message", - data_check_name="same test name") + error = DataCheckError(message="test message", data_check_name="same test name") assert error.to_dict() == { "message": "test message", "level": "error", - "data_check_name": "same test name" + "data_check_name": "same test name", } - warning = DataCheckWarning(message="test message", - data_check_name="same test name") + warning = DataCheckWarning(message="test message", data_check_name="same test name") assert warning.to_dict() == { "message": "test message", "level": "warning", - 
"data_check_name": "same test name" + "data_check_name": "same test name", } diff --git a/evalml/tests/data_checks_tests/test_data_checks.py b/evalml/tests/data_checks_tests/test_data_checks.py index 6c25223a80..de8f5d748c 100644 --- a/evalml/tests/data_checks_tests/test_data_checks.py +++ b/evalml/tests/data_checks_tests/test_data_checks.py @@ -15,7 +15,7 @@ DataChecks, DataCheckWarning, DefaultDataChecks, - EmptyDataChecks + EmptyDataChecks, ) from evalml.exceptions import DataCheckInitError @@ -34,30 +34,77 @@ def validate(self, X, y): class MockDataCheckWarning(DataCheck): def validate(self, X, y): - return {"warnings": [DataCheckWarning(message="warning one", data_check_name=self.name, message_code=None).to_dict()], - "errors": [], - "actions": []} + return { + "warnings": [ + DataCheckWarning( + message="warning one", + data_check_name=self.name, + message_code=None, + ).to_dict() + ], + "errors": [], + "actions": [], + } class MockDataCheckError(DataCheck): def validate(self, X, y): - return {"warnings": [], - "errors": [DataCheckError(message="error one", data_check_name=self.name, message_code=None).to_dict()], - "actions": []} + return { + "warnings": [], + "errors": [ + DataCheckError( + message="error one", + data_check_name=self.name, + message_code=None, + ).to_dict() + ], + "actions": [], + } class MockDataCheckErrorAndWarning(DataCheck): def validate(self, X, y): - return {"warnings": [DataCheckWarning(message="warning two", data_check_name=self.name, message_code=None).to_dict()], - "errors": [DataCheckError(message="error two", data_check_name=self.name, message_code=None).to_dict()], - "actions": []} - - data_checks_list = [MockDataCheck, MockDataCheckWarning, MockDataCheckError, MockDataCheckErrorAndWarning] + return { + "warnings": [ + DataCheckWarning( + message="warning two", + data_check_name=self.name, + message_code=None, + ).to_dict() + ], + "errors": [ + DataCheckError( + message="error two", + data_check_name=self.name, + message_code=None, + ).to_dict() + ], + "actions": [], + } + + data_checks_list = [ + MockDataCheck, + MockDataCheckWarning, + MockDataCheckError, + MockDataCheckErrorAndWarning, + ] data_checks = DataChecks(data_checks=data_checks_list) assert data_checks.validate(X, y) == { - "warnings": [DataCheckWarning(message="warning one", data_check_name="MockDataCheckWarning").to_dict(), - DataCheckWarning(message="warning two", data_check_name="MockDataCheckErrorAndWarning").to_dict()], - "errors": [DataCheckError(message="error one", data_check_name="MockDataCheckError").to_dict(), - DataCheckError(message="error two", data_check_name="MockDataCheckErrorAndWarning").to_dict()], - "actions": [] + "warnings": [ + DataCheckWarning( + message="warning one", data_check_name="MockDataCheckWarning" + ).to_dict(), + DataCheckWarning( + message="warning two", data_check_name="MockDataCheckErrorAndWarning" + ).to_dict(), + ], + "errors": [ + DataCheckError( + message="error one", data_check_name="MockDataCheckError" + ).to_dict(), + DataCheckError( + message="error two", data_check_name="MockDataCheckErrorAndWarning" + ).to_dict(), + ], + "actions": [], } @@ -74,65 +121,106 @@ def test_empty_data_checks(input_type, X_y_binary): assert data_checks.validate(X, y) == {"warnings": [], "errors": [], "actions": []} -messages = [DataCheckWarning(message="Column 'all_null' is 95.0% or more null", - data_check_name="HighlyNullDataCheck", - message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"column": "all_null", "pct_null_rows": 1.0}).to_dict(), - 
DataCheckWarning(message="Column 'also_all_null' is 95.0% or more null", - data_check_name="HighlyNullDataCheck", - message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"column": "also_all_null", "pct_null_rows": 1.0}).to_dict(), - DataCheckWarning(message="Column 'id' is 100.0% or more likely to be an ID column", - data_check_name="IDColumnsDataCheck", - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": "id"}).to_dict(), - DataCheckError(message="1 row(s) (20.0%) of target values are null", - data_check_name="InvalidTargetDataCheck", - message_code=DataCheckMessageCode.TARGET_HAS_NULL, - details={"num_null_rows": 1, "pct_null_rows": 20.0}).to_dict(), - DataCheckError(message="lots_of_null has 1 unique value.", - data_check_name="NoVarianceDataCheck", - message_code=DataCheckMessageCode.NO_VARIANCE, - details={"column": "lots_of_null"}).to_dict(), - DataCheckError(message="all_null has 0 unique value.", - data_check_name="NoVarianceDataCheck", - message_code=DataCheckMessageCode.NO_VARIANCE, - details={"column": "all_null"}).to_dict(), - DataCheckError(message="also_all_null has 0 unique value.", - data_check_name="NoVarianceDataCheck", - message_code=DataCheckMessageCode.NO_VARIANCE, - details={"column": "also_all_null"}).to_dict(), - DataCheckError(message='Input natural language column(s) (natural_language_nan) contains NaN values. Please impute NaN values or drop these rows or columns.', - data_check_name="NaturalLanguageNaNDataCheck", - message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN, - details={"columns": 'natural_language_nan'}).to_dict(), - DataCheckError(message='Input datetime column(s) (nan_dt_col) contains NaN values. Please impute NaN values or drop these rows or columns.', - data_check_name="DateTimeNaNDataCheck", - message_code=DataCheckMessageCode.DATETIME_HAS_NAN, - details={"columns": 'nan_dt_col'}).to_dict()] - -expected_actions = [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'also_all_null'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'id'}).to_dict(), - DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, 'impute_strategy': 'most_frequent'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'lots_of_null'}).to_dict()] +messages = [ + DataCheckWarning( + message="Column 'all_null' is 95.0% or more null", + data_check_name="HighlyNullDataCheck", + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={"column": "all_null", "pct_null_rows": 1.0}, + ).to_dict(), + DataCheckWarning( + message="Column 'also_all_null' is 95.0% or more null", + data_check_name="HighlyNullDataCheck", + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={"column": "also_all_null", "pct_null_rows": 1.0}, + ).to_dict(), + DataCheckWarning( + message="Column 'id' is 100.0% or more likely to be an ID column", + data_check_name="IDColumnsDataCheck", + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": "id"}, + ).to_dict(), + DataCheckError( + message="1 row(s) (20.0%) of target values are null", + data_check_name="InvalidTargetDataCheck", + message_code=DataCheckMessageCode.TARGET_HAS_NULL, + details={"num_null_rows": 1, "pct_null_rows": 20.0}, + ).to_dict(), + DataCheckError( + message="lots_of_null has 1 unique value.", + data_check_name="NoVarianceDataCheck", + 
message_code=DataCheckMessageCode.NO_VARIANCE, + details={"column": "lots_of_null"}, + ).to_dict(), + DataCheckError( + message="all_null has 0 unique value.", + data_check_name="NoVarianceDataCheck", + message_code=DataCheckMessageCode.NO_VARIANCE, + details={"column": "all_null"}, + ).to_dict(), + DataCheckError( + message="also_all_null has 0 unique value.", + data_check_name="NoVarianceDataCheck", + message_code=DataCheckMessageCode.NO_VARIANCE, + details={"column": "also_all_null"}, + ).to_dict(), + DataCheckError( + message="Input natural language column(s) (natural_language_nan) contains NaN values. Please impute NaN values or drop these rows or columns.", + data_check_name="NaturalLanguageNaNDataCheck", + message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN, + details={"columns": "natural_language_nan"}, + ).to_dict(), + DataCheckError( + message="Input datetime column(s) (nan_dt_col) contains NaN values. Please impute NaN values or drop these rows or columns.", + data_check_name="DateTimeNaNDataCheck", + message_code=DataCheckMessageCode.DATETIME_HAS_NAN, + details={"columns": "nan_dt_col"}, + ).to_dict(), +] + +expected_actions = [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "all_null"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "also_all_null"} + ).to_dict(), + DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": "id"}).to_dict(), + DataCheckAction( + DataCheckActionCode.IMPUTE_COL, + metadata={ + "column": None, + "is_target": True, + "impute_strategy": "most_frequent", + }, + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "lots_of_null"} + ).to_dict(), +] @pytest.mark.parametrize("input_type", ["pd", "ww"]) def test_default_data_checks_classification(input_type): - X = pd.DataFrame({'lots_of_null': [None, None, None, None, "some data"], - 'all_null': [None, None, None, None, None], - 'also_all_null': [None, None, None, None, None], - 'no_null': [1, 2, 3, 4, 5], - 'id': [0, 1, 2, 3, 4], - 'has_label_leakage': [100, 200, 100, 200, 100], - 'natural_language_nan': [None, - "string_that_is_long_enough_for_natural_language_1", - "string_that_is_long_enough_for_natural_language_2", - "string_that_is_long_enough_for_natural_language_3", - "string_that_is_long_enough_for_natural_language_4"], - 'nan_dt_col': pd.Series(pd.date_range('20200101', periods=5))}) - X['nan_dt_col'][0] = None + X = pd.DataFrame( + { + "lots_of_null": [None, None, None, None, "some data"], + "all_null": [None, None, None, None, None], + "also_all_null": [None, None, None, None, None], + "no_null": [1, 2, 3, 4, 5], + "id": [0, 1, 2, 3, 4], + "has_label_leakage": [100, 200, 100, 200, 100], + "natural_language_nan": [ + None, + "string_that_is_long_enough_for_natural_language_1", + "string_that_is_long_enough_for_natural_language_2", + "string_that_is_long_enough_for_natural_language_3", + "string_that_is_long_enough_for_natural_language_4", + ], + "nan_dt_col": pd.Series(pd.date_range("20200101", periods=5)), + } + ) + X["nan_dt_col"][0] = None y = pd.Series([0, 1, np.nan, 1, 0]) y_multiclass = pd.Series([0, 1, np.nan, 2, 0]) @@ -141,62 +229,111 @@ def test_default_data_checks_classification(input_type): y = ww.init_series(y) y_multiclass = ww.init_series(y_multiclass) - data_checks = DefaultDataChecks("binary", get_default_primary_search_objective("binary")) - imbalance = [DataCheckError(message="The number of instances of these targets is less than 2 * the number of cross folds = 6 
instances: [0.0, 1.0]", - data_check_name="ClassImbalanceDataCheck", - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, - details={"target_values": [0.0, 1.0]}).to_dict()] + data_checks = DefaultDataChecks( + "binary", get_default_primary_search_objective("binary") + ) + imbalance = [ + DataCheckError( + message="The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0.0, 1.0]", + data_check_name="ClassImbalanceDataCheck", + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, + details={"target_values": [0.0, 1.0]}, + ).to_dict() + ] - assert data_checks.validate(X, y) == {"warnings": messages[:3], "errors": messages[3:] + imbalance, "actions": expected_actions} + assert data_checks.validate(X, y) == { + "warnings": messages[:3], + "errors": messages[3:] + imbalance, + "actions": expected_actions, + } - data_checks = DataChecks(DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES, - {"InvalidTargetDataCheck": {"problem_type": "binary", - "objective": get_default_primary_search_objective("binary")}}) - assert data_checks.validate(X, y) == {"warnings": messages[:3], "errors": messages[3:], "actions": expected_actions} + data_checks = DataChecks( + DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES, + { + "InvalidTargetDataCheck": { + "problem_type": "binary", + "objective": get_default_primary_search_objective("binary"), + } + }, + ) + assert data_checks.validate(X, y) == { + "warnings": messages[:3], + "errors": messages[3:], + "actions": expected_actions, + } # multiclass - imbalance = [DataCheckError(message="The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0.0, 2.0, 1.0]", - data_check_name="ClassImbalanceDataCheck", - message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, - details={"target_values": [0.0, 2.0, 1.0]}).to_dict()] - min_2_class_count = [DataCheckError(message="Target does not have at least two instances per class which is required for multiclass classification", - data_check_name="InvalidTargetDataCheck", - message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS, - details={"least_populated_class_labels": [2.0, 1.0]}).to_dict()] - high_class_to_sample_ratio = [DataCheckWarning( - message="Target has a large number of unique values, could be regression type problem.", - data_check_name="InvalidTargetDataCheck", - message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS, - details={'class_to_value_ratio': 0.6}).to_dict()] + imbalance = [ + DataCheckError( + message="The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0.0, 2.0, 1.0]", + data_check_name="ClassImbalanceDataCheck", + message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS, + details={"target_values": [0.0, 2.0, 1.0]}, + ).to_dict() + ] + min_2_class_count = [ + DataCheckError( + message="Target does not have at least two instances per class which is required for multiclass classification", + data_check_name="InvalidTargetDataCheck", + message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS, + details={"least_populated_class_labels": [2.0, 1.0]}, + ).to_dict() + ] + high_class_to_sample_ratio = [ + DataCheckWarning( + message="Target has a large number of unique values, could be regression type problem.", + data_check_name="InvalidTargetDataCheck", + message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS, + details={"class_to_value_ratio": 0.6}, + ).to_dict() + ] # 
multiclass - data_checks = DefaultDataChecks("multiclass", get_default_primary_search_objective("multiclass")) - assert data_checks.validate(X, y_multiclass) == {"warnings": messages[:3] + high_class_to_sample_ratio, - "errors": [messages[3]] + min_2_class_count + messages[4:] + imbalance, - "actions": expected_actions} + data_checks = DefaultDataChecks( + "multiclass", get_default_primary_search_objective("multiclass") + ) + assert data_checks.validate(X, y_multiclass) == { + "warnings": messages[:3] + high_class_to_sample_ratio, + "errors": [messages[3]] + min_2_class_count + messages[4:] + imbalance, + "actions": expected_actions, + } - data_checks = DataChecks(DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES, - {"InvalidTargetDataCheck": {"problem_type": "multiclass", - "objective": get_default_primary_search_objective("multiclass")}}) - assert data_checks.validate(X, y_multiclass) == {"warnings": messages[:3] + high_class_to_sample_ratio, - "errors": [messages[3]] + min_2_class_count + messages[4:], - "actions": expected_actions} + data_checks = DataChecks( + DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES, + { + "InvalidTargetDataCheck": { + "problem_type": "multiclass", + "objective": get_default_primary_search_objective("multiclass"), + } + }, + ) + assert data_checks.validate(X, y_multiclass) == { + "warnings": messages[:3] + high_class_to_sample_ratio, + "errors": [messages[3]] + min_2_class_count + messages[4:], + "actions": expected_actions, + } @pytest.mark.parametrize("input_type", ["pd", "ww"]) def test_default_data_checks_regression(input_type): - X = pd.DataFrame({'lots_of_null': [None, None, None, None, "some data"], - 'all_null': [None, None, None, None, None], - 'also_all_null': [None, None, None, None, None], - 'no_null': [1, 2, 3, 5, 5], - 'id': [0, 1, 2, 3, 4], - 'has_label_leakage': [100, 200, 100, 200, 100], - 'natural_language_nan': [None, - "string_that_is_long_enough_for_natural_language_1", - "string_that_is_long_enough_for_natural_language_2", - "string_that_is_long_enough_for_natural_language_3", - "string_that_is_long_enough_for_natural_language_4"], - 'nan_dt_col': pd.Series(pd.date_range('20200101', periods=5))}) - X['nan_dt_col'][0] = None + X = pd.DataFrame( + { + "lots_of_null": [None, None, None, None, "some data"], + "all_null": [None, None, None, None, None], + "also_all_null": [None, None, None, None, None], + "no_null": [1, 2, 3, 5, 5], + "id": [0, 1, 2, 3, 4], + "has_label_leakage": [100, 200, 100, 200, 100], + "natural_language_nan": [ + None, + "string_that_is_long_enough_for_natural_language_1", + "string_that_is_long_enough_for_natural_language_2", + "string_that_is_long_enough_for_natural_language_3", + "string_that_is_long_enough_for_natural_language_4", + ], + "nan_dt_col": pd.Series(pd.date_range("20200101", periods=5)), + } + ) + X["nan_dt_col"][0] = None y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2]) y_no_variance = pd.Series([5] * 5) @@ -204,95 +341,179 @@ def test_default_data_checks_regression(input_type): X.ww.init() y = ww.init_series(y) y_no_variance = ww.init_series(y_no_variance) - null_leakage = [DataCheckWarning(message="Column 'lots_of_null' is 95.0% or more correlated with the target", - data_check_name="TargetLeakageDataCheck", - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "lots_of_null"}).to_dict()] - data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression")) - id_leakage_warning = [DataCheckWarning(message="Column 'id' is 95.0% or more correlated with the target", 
- data_check_name="TargetLeakageDataCheck", - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "id"}).to_dict()] - nan_dt_leakage_warning = [DataCheckWarning(message="Column 'nan_dt_col' is 95.0% or more correlated with the target", - data_check_name="TargetLeakageDataCheck", - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "nan_dt_col"}).to_dict()] - - impute_action = DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, 'impute_strategy': 'mean'}).to_dict() - nan_dt_action = DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'nan_dt_col'}).to_dict() - expected_actions_with_drop_and_impute = expected_actions[:3] + [nan_dt_action, impute_action] + expected_actions[4:] - assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning, - "errors": messages[3:], - "actions": expected_actions_with_drop_and_impute} + null_leakage = [ + DataCheckWarning( + message="Column 'lots_of_null' is 95.0% or more correlated with the target", + data_check_name="TargetLeakageDataCheck", + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "lots_of_null"}, + ).to_dict() + ] + data_checks = DefaultDataChecks( + "regression", get_default_primary_search_objective("regression") + ) + id_leakage_warning = [ + DataCheckWarning( + message="Column 'id' is 95.0% or more correlated with the target", + data_check_name="TargetLeakageDataCheck", + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "id"}, + ).to_dict() + ] + nan_dt_leakage_warning = [ + DataCheckWarning( + message="Column 'nan_dt_col' is 95.0% or more correlated with the target", + data_check_name="TargetLeakageDataCheck", + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "nan_dt_col"}, + ).to_dict() + ] + + impute_action = DataCheckAction( + DataCheckActionCode.IMPUTE_COL, + metadata={"column": None, "is_target": True, "impute_strategy": "mean"}, + ).to_dict() + nan_dt_action = DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "nan_dt_col"} + ).to_dict() + expected_actions_with_drop_and_impute = ( + expected_actions[:3] + [nan_dt_action, impute_action] + expected_actions[4:] + ) + assert data_checks.validate(X, y) == { + "warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning, + "errors": messages[3:], + "actions": expected_actions_with_drop_and_impute, + } # Skip Invalid Target assert data_checks.validate(X, y_no_variance) == { "warnings": messages[:3] + null_leakage, - "errors": messages[4:7] + [DataCheckError(message="Y has 1 unique value.", - data_check_name="NoVarianceDataCheck", - message_code=DataCheckMessageCode.NO_VARIANCE, - details={"column": "Y"}).to_dict()] + messages[7:], - "actions": expected_actions[:3] + expected_actions[4:] + "errors": messages[4:7] + + [ + DataCheckError( + message="Y has 1 unique value.", + data_check_name="NoVarianceDataCheck", + message_code=DataCheckMessageCode.NO_VARIANCE, + details={"column": "Y"}, + ).to_dict() + ] + + messages[7:], + "actions": expected_actions[:3] + expected_actions[4:], } - data_checks = DataChecks(DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES, - {"InvalidTargetDataCheck": {"problem_type": "regression", - "objective": get_default_primary_search_objective("regression")}}) - assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning, - "errors": messages[3:], - "actions": 
expected_actions_with_drop_and_impute} + data_checks = DataChecks( + DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES, + { + "InvalidTargetDataCheck": { + "problem_type": "regression", + "objective": get_default_primary_search_objective("regression"), + } + }, + ) + assert data_checks.validate(X, y) == { + "warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning, + "errors": messages[3:], + "actions": expected_actions_with_drop_and_impute, + } def test_default_data_checks_null_rows(): - class SeriesWrap(): + class SeriesWrap: def __init__(self, series): self.series = series def __eq__(self, series_2): return all(self.series.eq(series_2.series)) - X = pd.DataFrame({'all_null': [None, None, None, None, None], - 'also_all_null': [None, None, None, None, None]}) + X = pd.DataFrame( + { + "all_null": [None, None, None, None, None], + "also_all_null": [None, None, None, None, None], + } + ) y = pd.Series([0, 1, np.nan, 1, 0]) - data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression")) + data_checks = DefaultDataChecks( + "regression", get_default_primary_search_objective("regression") + ) highly_null_rows = SeriesWrap(pd.Series([1.0, 1.0, 1.0, 1.0, 1.0])) expected = { - "warnings": [DataCheckWarning(message="5 out of 5 rows are more than 95.0% null", - data_check_name="HighlyNullDataCheck", - message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS, - details={"pct_null_cols": highly_null_rows}).to_dict(), - DataCheckWarning(message="Column 'all_null' is 95.0% or more null", - data_check_name="HighlyNullDataCheck", - message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"column": 'all_null', "pct_null_rows": 1.0}).to_dict(), - DataCheckWarning(message="Column 'also_all_null' is 95.0% or more null", - data_check_name="HighlyNullDataCheck", - message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"column": 'also_all_null', "pct_null_rows": 1.0}).to_dict()], - "errors": [DataCheckError(message="1 row(s) (20.0%) of target values are null", - data_check_name="InvalidTargetDataCheck", - message_code=DataCheckMessageCode.TARGET_HAS_NULL, - details={"num_null_rows": 1, "pct_null_rows": 20.0}).to_dict(), - DataCheckError(message="all_null has 0 unique value.", - data_check_name="NoVarianceDataCheck", - message_code=DataCheckMessageCode.NO_VARIANCE, - details={"column": "all_null"}).to_dict(), - DataCheckError(message="also_all_null has 0 unique value.", - data_check_name="NoVarianceDataCheck", - message_code=DataCheckMessageCode.NO_VARIANCE, - details={"column": "also_all_null"}).to_dict()], - "actions": [DataCheckAction(DataCheckActionCode.DROP_ROWS, metadata={"rows": [0, 1, 2, 3, 4]}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'also_all_null'}).to_dict(), - DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, "impute_strategy": "mean"}).to_dict()]} + "warnings": [ + DataCheckWarning( + message="5 out of 5 rows are more than 95.0% null", + data_check_name="HighlyNullDataCheck", + message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS, + details={"pct_null_cols": highly_null_rows}, + ).to_dict(), + DataCheckWarning( + message="Column 'all_null' is 95.0% or more null", + data_check_name="HighlyNullDataCheck", + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={"column": "all_null", "pct_null_rows": 1.0}, + ).to_dict(), + DataCheckWarning( + message="Column 
'also_all_null' is 95.0% or more null", + data_check_name="HighlyNullDataCheck", + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={"column": "also_all_null", "pct_null_rows": 1.0}, + ).to_dict(), + ], + "errors": [ + DataCheckError( + message="1 row(s) (20.0%) of target values are null", + data_check_name="InvalidTargetDataCheck", + message_code=DataCheckMessageCode.TARGET_HAS_NULL, + details={"num_null_rows": 1, "pct_null_rows": 20.0}, + ).to_dict(), + DataCheckError( + message="all_null has 0 unique value.", + data_check_name="NoVarianceDataCheck", + message_code=DataCheckMessageCode.NO_VARIANCE, + details={"column": "all_null"}, + ).to_dict(), + DataCheckError( + message="also_all_null has 0 unique value.", + data_check_name="NoVarianceDataCheck", + message_code=DataCheckMessageCode.NO_VARIANCE, + details={"column": "also_all_null"}, + ).to_dict(), + ], + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_ROWS, metadata={"rows": [0, 1, 2, 3, 4]} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "all_null"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "also_all_null"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.IMPUTE_COL, + metadata={"column": None, "is_target": True, "impute_strategy": "mean"}, + ).to_dict(), + ], + } validation_results = data_checks.validate(X, y) - validation_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validation_results['warnings'][0]['details']['pct_null_cols']) + validation_results["warnings"][0]["details"]["pct_null_cols"] = SeriesWrap( + validation_results["warnings"][0]["details"]["pct_null_cols"] + ) assert validation_results == expected def test_default_data_checks_time_series_regression(): - regression_data_check_classes = [check.__class__ for check in DefaultDataChecks("regression", get_default_primary_search_objective("regression")).data_checks] - ts_regression_data_check_classes = [check.__class__ for check in DefaultDataChecks("time series regression", get_default_primary_search_objective("time series regression")).data_checks] + regression_data_check_classes = [ + check.__class__ + for check in DefaultDataChecks( + "regression", get_default_primary_search_objective("regression") + ).data_checks + ] + ts_regression_data_check_classes = [ + check.__class__ + for check in DefaultDataChecks( + "time series regression", + get_default_primary_search_objective("time series regression"), + ).data_checks + ] assert regression_data_check_classes == ts_regression_data_check_classes @@ -310,10 +531,15 @@ def validate(self, X, y=None): """Mock validate.""" return MockCheck + data_checks = [make_mock_data_check("check_1"), make_mock_data_check("check_2")] - checks = DataChecks(data_checks, - data_check_params={"check_1": {"foo": 1, "bar": 2}, - "check_2": {"foo": 3, "bar": 1, "baz": 4}}) + checks = DataChecks( + data_checks, + data_check_params={ + "check_1": {"foo": 1, "bar": 2}, + "check_2": {"foo": 3, "bar": 1, "baz": 4}, + }, + ) assert checks.data_checks[0].foo == 1 assert checks.data_checks[0].bar == 2 assert checks.data_checks[0].baz == 3 @@ -342,43 +568,95 @@ def validate(self, X, y=None): """Mock validate.""" -@pytest.mark.parametrize("classes,params,expected_exception,expected_message", - [([MockCheck], {"mock_check": 1}, DataCheckInitError, - "Parameters for mock_check were not in a dictionary. 
Received 1."), - ([MockCheck], {"mock_check": {"foo": 1}}, DataCheckInitError, - r"Encountered the following error while initializing mock_check: __init__\(\) missing 1 required positional argument: 'bar'"), - ([MockCheck], {"mock_check": {"Bar": 2}}, DataCheckInitError, - r"Encountered the following error while initializing mock_check: __init__\(\) got an unexpected keyword argument 'Bar'"), - ([MockCheck], {"mock_check": {"fo": 3, "ba": 4}}, DataCheckInitError, - r"Encountered the following error while initializing mock_check: __init__\(\) got an unexpected keyword argument 'fo'"), - ([MockCheck], {"MockCheck": {"foo": 2, "bar": 4}}, DataCheckInitError, - "Class MockCheck was provided in params dictionary but it does not match any name in the data_check_classes list."), - ([MockCheck, MockCheck2], {"MockCheck": {"foo": 2, "bar": 4}}, DataCheckInitError, - "Class mock_check was provided in the data_checks_classes list but it does not have an entry in the parameters dictionary."), - ([1], None, ValueError, ("All elements of parameter data_checks must be an instance of DataCheck " + - "or a DataCheck class with any desired parameters specified in the " + - "data_check_params dictionary.")), - ([MockCheck], [1], ValueError, r"Params must be a dictionary. Received \[1\]")]) -def test_data_checks_raises_value_errors_on_init(classes, params, expected_exception, expected_message): +@pytest.mark.parametrize( + "classes,params,expected_exception,expected_message", + [ + ( + [MockCheck], + {"mock_check": 1}, + DataCheckInitError, + "Parameters for mock_check were not in a dictionary. Received 1.", + ), + ( + [MockCheck], + {"mock_check": {"foo": 1}}, + DataCheckInitError, + r"Encountered the following error while initializing mock_check: __init__\(\) missing 1 required positional argument: 'bar'", + ), + ( + [MockCheck], + {"mock_check": {"Bar": 2}}, + DataCheckInitError, + r"Encountered the following error while initializing mock_check: __init__\(\) got an unexpected keyword argument 'Bar'", + ), + ( + [MockCheck], + {"mock_check": {"fo": 3, "ba": 4}}, + DataCheckInitError, + r"Encountered the following error while initializing mock_check: __init__\(\) got an unexpected keyword argument 'fo'", + ), + ( + [MockCheck], + {"MockCheck": {"foo": 2, "bar": 4}}, + DataCheckInitError, + "Class MockCheck was provided in params dictionary but it does not match any name in the data_check_classes list.", + ), + ( + [MockCheck, MockCheck2], + {"MockCheck": {"foo": 2, "bar": 4}}, + DataCheckInitError, + "Class mock_check was provided in the data_checks_classes list but it does not have an entry in the parameters dictionary.", + ), + ( + [1], + None, + ValueError, + ( + "All elements of parameter data_checks must be an instance of DataCheck " + + "or a DataCheck class with any desired parameters specified in the " + + "data_check_params dictionary." + ), + ), + ([MockCheck], [1], ValueError, r"Params must be a dictionary. 
Received \[1\]"), + ], +) +def test_data_checks_raises_value_errors_on_init( + classes, params, expected_exception, expected_message +): with pytest.raises(expected_exception, match=expected_message): DataChecks(classes, params) -@pytest.mark.parametrize("objective", ["Root Mean Squared Log Error", "Mean Squared Log Error", "Mean Absolute Percentage Error"]) +@pytest.mark.parametrize( + "objective", + [ + "Root Mean Squared Log Error", + "Mean Squared Log Error", + "Mean Absolute Percentage Error", + ], +) def test_errors_warnings_in_invalid_target_data_check(objective, ts_data): X, y = ts_data y[0] = -1 y = pd.Series(y) details = {"Count of offending values": sum(val <= 0 for val in y.values.flatten())} - data_check_error = DataCheckError(message=f"Target has non-positive values which is not supported for {objective}", - data_check_name="InvalidTargetDataCheck", - message_code=DataCheckMessageCode.TARGET_INCOMPATIBLE_OBJECTIVE, - details=details).to_dict() + data_check_error = DataCheckError( + message=f"Target has non-positive values which is not supported for {objective}", + data_check_name="InvalidTargetDataCheck", + message_code=DataCheckMessageCode.TARGET_INCOMPATIBLE_OBJECTIVE, + details=details, + ).to_dict() - default_data_check = DefaultDataChecks(problem_type="time series regression", objective=objective).data_checks + default_data_check = DefaultDataChecks( + problem_type="time series regression", objective=objective + ).data_checks for check in default_data_check: if check.name == "InvalidTargetDataCheck": - assert check.validate(X, y) == {"warnings": [], "errors": [data_check_error], "actions": []} + assert check.validate(X, y) == { + "warnings": [], + "errors": [data_check_error], + "actions": [], + } def test_data_checks_do_not_duplicate_actions(X_y_binary): @@ -386,7 +664,15 @@ def test_data_checks_do_not_duplicate_actions(X_y_binary): class MockDataCheck(DataCheck): def validate(self, X, y): - return {"warnings": [], "errors": [], "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'col_to_drop'}).to_dict()]} + return { + "warnings": [], + "errors": [], + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "col_to_drop"} + ).to_dict() + ], + } class MockDataCheckWithSameAction(DataCheck): def validate(self, X, y): @@ -399,15 +685,19 @@ def validate(self, X, y): assert data_checks.validate(X, y) == { "warnings": [], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'col_to_drop'}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "col_to_drop"} + ).to_dict() + ], } def test_data_checks_drop_index(X_y_binary): X, y = X_y_binary X = pd.DataFrame(X) - X['index_col'] = pd.Series(range(len(X))) - X.ww.init(index='index_col') + X["index_col"] = pd.Series(range(len(X))) + X.ww.init(index="index_col") class MockDataCheck(DataCheck): def validate(self, X, y): @@ -421,4 +711,4 @@ def validate(self, X, y): validate_args = MockDataCheck.validate.call_args_list for arg in validate_args: - assert 'index_col' not in arg[0][0].columns + assert "index_col" not in arg[0][0].columns diff --git a/evalml/tests/data_checks_tests/test_datetime_nan_data_check.py b/evalml/tests/data_checks_tests/test_datetime_nan_data_check.py index bcc137d85d..bdfc8330c1 100644 --- a/evalml/tests/data_checks_tests/test_datetime_nan_data_check.py +++ b/evalml/tests/data_checks_tests/test_datetime_nan_data_check.py @@ -4,32 +4,34 @@ from evalml.data_checks import ( 
DataCheckError, DataCheckMessageCode, - DateTimeNaNDataCheck + DateTimeNaNDataCheck, ) def test_datetime_nan_data_check_error(ts_data): data, _ = ts_data data.reset_index(inplace=True, drop=False) - data.at[0, 'index'] = np.NaN + data.at[0, "index"] = np.NaN dt_nan_check = DateTimeNaNDataCheck() assert dt_nan_check.validate(data) == { "warnings": [], "actions": [], - "errors": [DataCheckError(message='Input datetime column(s) (index) contains NaN values. Please impute NaN values or drop these rows or columns.', - data_check_name=DateTimeNaNDataCheck.name, - message_code=DataCheckMessageCode.DATETIME_HAS_NAN, - details={"columns": 'index'}).to_dict()] + "errors": [ + DataCheckError( + message="Input datetime column(s) (index) contains NaN values. Please impute NaN values or drop these rows or columns.", + data_check_name=DateTimeNaNDataCheck.name, + message_code=DataCheckMessageCode.DATETIME_HAS_NAN, + details={"columns": "index"}, + ).to_dict() + ], } def test_datetime_nan_data_check_error_numeric_columns_no_null(): dt_nan_check = DateTimeNaNDataCheck() - assert dt_nan_check.validate(pd.DataFrame(np.random.randint(0, 10, size=(10, 4)))) == { - "warnings": [], - "actions": [], - "errors": [] - } + assert dt_nan_check.validate( + pd.DataFrame(np.random.randint(0, 10, size=(10, 4))) + ) == {"warnings": [], "actions": [], "errors": []} def test_datetime_nan_data_check_error_numeric_null_columns(): @@ -37,43 +39,39 @@ def test_datetime_nan_data_check_error_numeric_null_columns(): data = data.replace(data.iloc[0][0], None) data = data.replace(data.iloc[1][1], None) dt_nan_check = DateTimeNaNDataCheck() - assert dt_nan_check.validate(data) == { - "warnings": [], - "actions": [], - "errors": [] - } + assert dt_nan_check.validate(data) == {"warnings": [], "actions": [], "errors": []} def test_datetime_nan_data_check_multiple_dt_no_nan(): data = pd.DataFrame() - data['A'] = pd.Series(pd.date_range('20200101', periods=3)) - data['B'] = pd.Series(pd.date_range('20200101', periods=3)) - data['C'] = np.random.randint(0, 5, size=len(data)) + data["A"] = pd.Series(pd.date_range("20200101", periods=3)) + data["B"] = pd.Series(pd.date_range("20200101", periods=3)) + data["C"] = np.random.randint(0, 5, size=len(data)) dt_nan_check = DateTimeNaNDataCheck() - assert dt_nan_check.validate(data) == { - "warnings": [], - "actions": [], - "errors": [] - } + assert dt_nan_check.validate(data) == {"warnings": [], "actions": [], "errors": []} def test_datetime_nan_data_check_multiple_nan_dt(): data = pd.DataFrame() - data['A'] = pd.Series(pd.date_range('20200101', periods=3)) + data["A"] = pd.Series(pd.date_range("20200101", periods=3)) data.loc[0][0] = None - data['B'] = pd.Series(pd.date_range('20200101', periods=3)) + data["B"] = pd.Series(pd.date_range("20200101", periods=3)) data.loc[0][1] = None - data['C'] = np.random.randint(0, 5, size=len(data)) + data["C"] = np.random.randint(0, 5, size=len(data)) dt_nan_check = DateTimeNaNDataCheck() assert dt_nan_check.validate(data) == { "warnings": [], "actions": [], - "errors": [DataCheckError(message='Input datetime column(s) (A, B) contains NaN values. Please impute NaN values or drop these rows or columns.', - data_check_name=DateTimeNaNDataCheck.name, - message_code=DataCheckMessageCode.DATETIME_HAS_NAN, - details={"columns": 'A, B'}).to_dict()] + "errors": [ + DataCheckError( + message="Input datetime column(s) (A, B) contains NaN values. 
Please impute NaN values or drop these rows or columns.", + data_check_name=DateTimeNaNDataCheck.name, + message_code=DataCheckMessageCode.DATETIME_HAS_NAN, + details={"columns": "A, B"}, + ).to_dict() + ], } @@ -81,36 +79,63 @@ def test_datetime_nan_check_input_formats(): dt_nan_check = DateTimeNaNDataCheck() # test empty pd.DataFrame - assert dt_nan_check.validate(pd.DataFrame()) == {"warnings": [], "errors": [], "actions": []} + assert dt_nan_check.validate(pd.DataFrame()) == { + "warnings": [], + "errors": [], + "actions": [], + } expected = { "warnings": [], "actions": [], - "errors": [DataCheckError(message='Input datetime column(s) (index) contains NaN values. Please impute NaN values or drop these rows or columns.', - data_check_name=DateTimeNaNDataCheck.name, - message_code=DataCheckMessageCode.DATETIME_HAS_NAN, - details={"columns": 'index'}).to_dict()] + "errors": [ + DataCheckError( + message="Input datetime column(s) (index) contains NaN values. Please impute NaN values or drop these rows or columns.", + data_check_name=DateTimeNaNDataCheck.name, + message_code=DataCheckMessageCode.DATETIME_HAS_NAN, + details={"columns": "index"}, + ).to_dict() + ], } - dates = np.arange(np.datetime64('2017-01-01'), np.datetime64('2017-01-08')) - dates[0] = np.datetime64('NaT') + dates = np.arange(np.datetime64("2017-01-01"), np.datetime64("2017-01-08")) + dates[0] = np.datetime64("NaT") # test Woodwork - ww_input = pd.DataFrame(dates, columns=['index']) + ww_input = pd.DataFrame(dates, columns=["index"]) ww_input.ww.init() assert dt_nan_check.validate(ww_input) == expected expected = { "warnings": [], "actions": [], - "errors": [DataCheckError(message='Input datetime column(s) (0) contains NaN values. Please impute NaN values or drop these rows or columns.', - data_check_name=DateTimeNaNDataCheck.name, - message_code=DataCheckMessageCode.DATETIME_HAS_NAN, - details={'columns': '0'}).to_dict()] + "errors": [ + DataCheckError( + message="Input datetime column(s) (0) contains NaN values. 
Please impute NaN values or drop these rows or columns.", + data_check_name=DateTimeNaNDataCheck.name, + message_code=DataCheckMessageCode.DATETIME_HAS_NAN, + details={"columns": "0"}, + ).to_dict() + ], } # test 2D list - assert dt_nan_check.validate([dates, np.arange(np.datetime64('2017-01-01'), np.datetime64('2017-01-08'))]) == expected + assert ( + dt_nan_check.validate( + [dates, np.arange(np.datetime64("2017-01-01"), np.datetime64("2017-01-08"))] + ) + == expected + ) # test np.array - assert dt_nan_check.validate(np.array([dates, np.arange(np.datetime64('2017-01-01'), np.datetime64('2017-01-08'))])) == expected + assert ( + dt_nan_check.validate( + np.array( + [ + dates, + np.arange(np.datetime64("2017-01-01"), np.datetime64("2017-01-08")), + ] + ) + ) + == expected + ) diff --git a/evalml/tests/data_checks_tests/test_highly_null_data_check.py b/evalml/tests/data_checks_tests/test_highly_null_data_check.py index 07b8315ea2..fe52315763 100644 --- a/evalml/tests/data_checks_tests/test_highly_null_data_check.py +++ b/evalml/tests/data_checks_tests/test_highly_null_data_check.py @@ -7,13 +7,13 @@ DataCheckActionCode, DataCheckMessageCode, DataCheckWarning, - HighlyNullDataCheck + HighlyNullDataCheck, ) highly_null_data_check_name = HighlyNullDataCheck.name -class SeriesWrap(): +class SeriesWrap: def __init__(self, series): self.series = series @@ -34,71 +34,124 @@ def test_highly_null_data_check_init(): highly_null_check = HighlyNullDataCheck(pct_null_threshold=1.0) assert highly_null_check.pct_null_threshold == 1.0 - with pytest.raises(ValueError, match="pct_null_threshold must be a float between 0 and 1, inclusive."): + with pytest.raises( + ValueError, + match="pct_null_threshold must be a float between 0 and 1, inclusive.", + ): HighlyNullDataCheck(pct_null_threshold=-0.1) - with pytest.raises(ValueError, match="pct_null_threshold must be a float between 0 and 1, inclusive."): + with pytest.raises( + ValueError, + match="pct_null_threshold must be a float between 0 and 1, inclusive.", + ): HighlyNullDataCheck(pct_null_threshold=1.1) def test_highly_null_data_check_warnings(): - data = pd.DataFrame({'lots_of_null': [None, None, None, None, 5], - 'all_null': [None, None, None, None, None], - 'no_null': [1, 2, 3, 4, 5]}) + data = pd.DataFrame( + { + "lots_of_null": [None, None, None, None, 5], + "all_null": [None, None, None, None, None], + "no_null": [1, 2, 3, 4, 5], + } + ) no_null_check = HighlyNullDataCheck(pct_null_threshold=0.0) highly_null_rows = SeriesWrap(pd.Series([2 / 3, 2 / 3, 2 / 3, 2 / 3, 1 / 3])) validate_results = no_null_check.validate(data) - validate_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validate_results['warnings'][0]['details']['pct_null_cols']) + validate_results["warnings"][0]["details"]["pct_null_cols"] = SeriesWrap( + validate_results["warnings"][0]["details"]["pct_null_cols"] + ) assert validate_results == { - "warnings": [DataCheckWarning(message="5 out of 5 rows are more than 0.0% null", - data_check_name=highly_null_data_check_name, - message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS, - details={"pct_null_cols": highly_null_rows}).to_dict(), - DataCheckWarning(message="Column 'lots_of_null' is 0.0% or more null", - data_check_name=highly_null_data_check_name, - message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"column": "lots_of_null", "pct_null_rows": 0.8}).to_dict(), - DataCheckWarning(message="Column 'all_null' is 0.0% or more null", - data_check_name=highly_null_data_check_name, - 
message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"column": "all_null", "pct_null_rows": 1.0}).to_dict()], + "warnings": [ + DataCheckWarning( + message="5 out of 5 rows are more than 0.0% null", + data_check_name=highly_null_data_check_name, + message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS, + details={"pct_null_cols": highly_null_rows}, + ).to_dict(), + DataCheckWarning( + message="Column 'lots_of_null' is 0.0% or more null", + data_check_name=highly_null_data_check_name, + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={"column": "lots_of_null", "pct_null_rows": 0.8}, + ).to_dict(), + DataCheckWarning( + message="Column 'all_null' is 0.0% or more null", + data_check_name=highly_null_data_check_name, + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={"column": "all_null", "pct_null_rows": 1.0}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_ROWS, metadata={"rows": [0, 1, 2, 3, 4]}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'lots_of_null'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_ROWS, metadata={"rows": [0, 1, 2, 3, 4]} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "lots_of_null"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "all_null"} + ).to_dict(), + ], } some_null_check = HighlyNullDataCheck(pct_null_threshold=0.5) highly_null_rows = SeriesWrap(pd.Series([2 / 3, 2 / 3, 2 / 3, 2 / 3])) validate_results = some_null_check.validate(data) - validate_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validate_results['warnings'][0]['details']['pct_null_cols']) + validate_results["warnings"][0]["details"]["pct_null_cols"] = SeriesWrap( + validate_results["warnings"][0]["details"]["pct_null_cols"] + ) assert validate_results == { - "warnings": [DataCheckWarning(message="4 out of 5 rows are more than 50.0% null", - data_check_name=highly_null_data_check_name, - message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS, - details={"pct_null_cols": highly_null_rows}).to_dict(), - DataCheckWarning(message="Column 'lots_of_null' is 50.0% or more null", - data_check_name=highly_null_data_check_name, - message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"column": "lots_of_null", "pct_null_rows": 0.8}).to_dict(), - DataCheckWarning(message="Column 'all_null' is 50.0% or more null", - data_check_name=highly_null_data_check_name, - message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"column": "all_null", "pct_null_rows": 1.0}).to_dict()], + "warnings": [ + DataCheckWarning( + message="4 out of 5 rows are more than 50.0% null", + data_check_name=highly_null_data_check_name, + message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS, + details={"pct_null_cols": highly_null_rows}, + ).to_dict(), + DataCheckWarning( + message="Column 'lots_of_null' is 50.0% or more null", + data_check_name=highly_null_data_check_name, + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={"column": "lots_of_null", "pct_null_rows": 0.8}, + ).to_dict(), + DataCheckWarning( + message="Column 'all_null' is 50.0% or more null", + data_check_name=highly_null_data_check_name, + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={"column": "all_null", "pct_null_rows": 1.0}, + ).to_dict(), + ], "errors": [], - "actions": 
[DataCheckAction(DataCheckActionCode.DROP_ROWS, metadata={"rows": [0, 1, 2, 3]}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'lots_of_null'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict()] - + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_ROWS, metadata={"rows": [0, 1, 2, 3]} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "lots_of_null"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "all_null"} + ).to_dict(), + ], } all_null_check = HighlyNullDataCheck(pct_null_threshold=1.0) assert all_null_check.validate(data) == { - "warnings": [DataCheckWarning(message="Column 'all_null' is 100.0% or more null", - data_check_name=highly_null_data_check_name, - message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"column": "all_null", "pct_null_rows": 1.0}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column 'all_null' is 100.0% or more null", + data_check_name=highly_null_data_check_name, + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={"column": "all_null", "pct_null_rows": 1.0}, + ).to_dict() + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "all_null"} + ).to_dict() + ], } @@ -106,45 +159,79 @@ def test_highly_null_data_check_input_formats(): highly_null_check = HighlyNullDataCheck(pct_null_threshold=0.8) # test empty pd.DataFrame - assert highly_null_check.validate(pd.DataFrame()) == {"warnings": [], "errors": [], "actions": []} + assert highly_null_check.validate(pd.DataFrame()) == { + "warnings": [], + "errors": [], + "actions": [], + } highly_null_rows = SeriesWrap(pd.Series([0.8])) expected = { - "warnings": [DataCheckWarning(message="1 out of 2 rows are more than 80.0% null", - data_check_name=highly_null_data_check_name, - message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS, - details={"pct_null_cols": highly_null_rows}).to_dict(), - DataCheckWarning(message="Column '0' is 80.0% or more null", - data_check_name=highly_null_data_check_name, - message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"column": 0, "pct_null_rows": 1.0}).to_dict(), - DataCheckWarning(message="Column '1' is 80.0% or more null", - data_check_name=highly_null_data_check_name, - message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"column": 1, "pct_null_rows": 1.0}).to_dict(), - DataCheckWarning(message="Column '2' is 80.0% or more null", - data_check_name=highly_null_data_check_name, - message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, - details={"column": 2, "pct_null_rows": 1.0}).to_dict()], + "warnings": [ + DataCheckWarning( + message="1 out of 2 rows are more than 80.0% null", + data_check_name=highly_null_data_check_name, + message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS, + details={"pct_null_cols": highly_null_rows}, + ).to_dict(), + DataCheckWarning( + message="Column '0' is 80.0% or more null", + data_check_name=highly_null_data_check_name, + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={"column": 0, "pct_null_rows": 1.0}, + ).to_dict(), + DataCheckWarning( + message="Column '1' is 80.0% or more null", + data_check_name=highly_null_data_check_name, + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={"column": 1, "pct_null_rows": 1.0}, + ).to_dict(), + DataCheckWarning( + 
message="Column '2' is 80.0% or more null", + data_check_name=highly_null_data_check_name, + message_code=DataCheckMessageCode.HIGHLY_NULL_COLS, + details={"column": 2, "pct_null_rows": 1.0}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_ROWS, metadata={"rows": [0]}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 0}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 1}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 2}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_ROWS, metadata={"rows": [0]} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": 0} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": 1} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": 2} + ).to_dict(), + ], } # test Woodwork ww_input = pd.DataFrame([[None, None, None, None, 0], [None, None, None, "hi", 5]]) ww_input.ww.init() validate_results = highly_null_check.validate(ww_input) - validate_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validate_results['warnings'][0]['details']['pct_null_cols']) + validate_results["warnings"][0]["details"]["pct_null_cols"] = SeriesWrap( + validate_results["warnings"][0]["details"]["pct_null_cols"] + ) assert validate_results == expected # # test 2D list - validate_results = highly_null_check.validate([[None, None, None, None, 0], [None, None, None, "hi", 5]]) - validate_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validate_results['warnings'][0]['details']['pct_null_cols']) + validate_results = highly_null_check.validate( + [[None, None, None, None, 0], [None, None, None, "hi", 5]] + ) + validate_results["warnings"][0]["details"]["pct_null_cols"] = SeriesWrap( + validate_results["warnings"][0]["details"]["pct_null_cols"] + ) assert validate_results == expected # test np.array - validate_results = highly_null_check.validate(np.array([[None, None, None, None, 0], [None, None, None, "hi", 5]])) - validate_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validate_results['warnings'][0]['details']['pct_null_cols']) + validate_results = highly_null_check.validate( + np.array([[None, None, None, None, 0], [None, None, None, "hi", 5]]) + ) + validate_results["warnings"][0]["details"]["pct_null_cols"] = SeriesWrap( + validate_results["warnings"][0]["details"]["pct_null_cols"] + ) assert validate_results == expected diff --git a/evalml/tests/data_checks_tests/test_id_columns_data_check.py b/evalml/tests/data_checks_tests/test_id_columns_data_check.py index cfe8364e5f..eb550fb85f 100644 --- a/evalml/tests/data_checks_tests/test_id_columns_data_check.py +++ b/evalml/tests/data_checks_tests/test_id_columns_data_check.py @@ -7,7 +7,7 @@ DataCheckActionCode, DataCheckMessageCode, DataCheckWarning, - IDColumnsDataCheck + IDColumnsDataCheck, ) id_data_check_name = IDColumnsDataCheck.name @@ -26,110 +26,185 @@ def test_id_cols_data_check_init(): id_cols_check = IDColumnsDataCheck(id_threshold=1.0) assert id_cols_check.id_threshold == 1.0 - with pytest.raises(ValueError, match="id_threshold must be a float between 0 and 1, inclusive."): + with pytest.raises( + ValueError, match="id_threshold must be a float between 0 and 1, inclusive." 
+ ): IDColumnsDataCheck(id_threshold=-0.1) - with pytest.raises(ValueError, match="id_threshold must be a float between 0 and 1, inclusive."): + with pytest.raises( + ValueError, match="id_threshold must be a float between 0 and 1, inclusive." + ): IDColumnsDataCheck(id_threshold=1.1) def test_id_columns_warning(): - X_dict = {'col_1_id': [0, 1, 2, 3], - 'col_2': [2, 3, 4, 5], - 'col_3_id': [1, 1, 2, 3], - 'Id': [3, 1, 2, 0], - 'col_5': [0, 0, 1, 2], - 'col_6': [0.1, 0.2, 0.3, 0.4] - } + X_dict = { + "col_1_id": [0, 1, 2, 3], + "col_2": [2, 3, 4, 5], + "col_3_id": [1, 1, 2, 3], + "Id": [3, 1, 2, 0], + "col_5": [0, 0, 1, 2], + "col_6": [0.1, 0.2, 0.3, 0.4], + } X = pd.DataFrame.from_dict(X_dict) id_cols_check = IDColumnsDataCheck(id_threshold=0.95) assert id_cols_check.validate(X) == { - "warnings": [DataCheckWarning(message="Column 'Id' is 95.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": "Id"}).to_dict(), - DataCheckWarning(message="Column 'col_1_id' is 95.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": "col_1_id"}).to_dict(), - DataCheckWarning(message="Column 'col_2' is 95.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": "col_2"}).to_dict(), - DataCheckWarning(message="Column 'col_3_id' is 95.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": "col_3_id"}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column 'Id' is 95.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": "Id"}, + ).to_dict(), + DataCheckWarning( + message="Column 'col_1_id' is 95.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": "col_1_id"}, + ).to_dict(), + DataCheckWarning( + message="Column 'col_2' is 95.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": "col_2"}, + ).to_dict(), + DataCheckWarning( + message="Column 'col_3_id' is 95.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": "col_3_id"}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": "Id"}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": "col_1_id"}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": "col_2"}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": "col_3_id"}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "Id"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "col_1_id"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "col_2"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "col_3_id"} + ).to_dict(), + ], } X = pd.DataFrame.from_dict(X_dict) id_cols_check = IDColumnsDataCheck(id_threshold=1.0) assert id_cols_check.validate(X) == { - "warnings": 
[DataCheckWarning(message="Column 'Id' is 100.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": "Id"}).to_dict(), - DataCheckWarning(message="Column 'col_1_id' is 100.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": "col_1_id"}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column 'Id' is 100.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": "Id"}, + ).to_dict(), + DataCheckWarning( + message="Column 'col_1_id' is 100.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": "col_1_id"}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": "Id"}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": "col_1_id"}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "Id"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "col_1_id"} + ).to_dict(), + ], } def test_id_columns_strings(): - X_dict = {'col_1_id': ["a", "b", "c", "d"], - 'col_2': ["w", "x", "y", "z"], - 'col_3_id': ["123456789012345", "234567890123456", "3456789012345678", "45678901234567"], - 'Id': ["z", "y", "x", "a"], - 'col_5': ["0", "0", "1", "2"], - 'col_6': [0.1, 0.2, 0.3, 0.4] - } + X_dict = { + "col_1_id": ["a", "b", "c", "d"], + "col_2": ["w", "x", "y", "z"], + "col_3_id": [ + "123456789012345", + "234567890123456", + "3456789012345678", + "45678901234567", + ], + "Id": ["z", "y", "x", "a"], + "col_5": ["0", "0", "1", "2"], + "col_6": [0.1, 0.2, 0.3, 0.4], + } X = pd.DataFrame.from_dict(X_dict) id_cols_check = IDColumnsDataCheck(id_threshold=0.95) assert id_cols_check.validate(X) == { - "warnings": [DataCheckWarning(message="Column 'Id' is 95.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": "Id"}).to_dict(), - DataCheckWarning(message="Column 'col_1_id' is 95.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": "col_1_id"}).to_dict(), - DataCheckWarning(message="Column 'col_2' is 95.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": "col_2"}).to_dict(), - DataCheckWarning(message="Column 'col_3_id' is 95.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": "col_3_id"}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column 'Id' is 95.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": "Id"}, + ).to_dict(), + DataCheckWarning( + message="Column 'col_1_id' is 95.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": "col_1_id"}, + ).to_dict(), + DataCheckWarning( + message="Column 'col_2' is 95.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + 
message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": "col_2"}, + ).to_dict(), + DataCheckWarning( + message="Column 'col_3_id' is 95.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": "col_3_id"}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": "Id"}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": "col_1_id"}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": "col_2"}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": "col_3_id"}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "Id"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "col_1_id"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "col_2"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "col_3_id"} + ).to_dict(), + ], } id_cols_check = IDColumnsDataCheck(id_threshold=1.0) assert id_cols_check.validate(X) == { - "warnings": [DataCheckWarning(message="Column 'Id' is 100.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": "Id"}).to_dict(), - DataCheckWarning(message="Column 'col_1_id' is 100.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": "col_1_id"}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column 'Id' is 100.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": "Id"}, + ).to_dict(), + DataCheckWarning( + message="Column 'col_1_id' is 100.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": "col_1_id"}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": "Id"}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": "col_1_id"}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "Id"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "col_1_id"} + ).to_dict(), + ], } @@ -137,51 +212,93 @@ def test_id_cols_data_check_input_formats(): id_cols_check = IDColumnsDataCheck(id_threshold=0.8) # test empty pd.DataFrame - assert id_cols_check.validate(pd.DataFrame()) == {"warnings": [], "errors": [], "actions": []} + assert id_cols_check.validate(pd.DataFrame()) == { + "warnings": [], + "errors": [], + "actions": [], + } # test Woodwork ww_input = pd.DataFrame(np.array([[0, 1], [1, 2], [2, 3], [3, 4], [4, 5]])) ww_input.ww.init() assert id_cols_check.validate(ww_input) == { - "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": 0}).to_dict(), - DataCheckWarning(message="Column '1' is 80.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": 1}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column '0' 
is 80.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": 0}, + ).to_dict(), + DataCheckWarning( + message="Column '1' is 80.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": 1}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 0}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 1}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": 0} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": 1} + ).to_dict(), + ], } # test 2D list assert id_cols_check.validate([[0, 1], [1, 2], [2, 3], [3, 4], [4, 5]]) == { - "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": 0}).to_dict(), - DataCheckWarning("Column '1' is 80.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": 1}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column '0' is 80.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": 0}, + ).to_dict(), + DataCheckWarning( + "Column '1' is 80.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": 1}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 0}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 1}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": 0} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": 1} + ).to_dict(), + ], } # test np.array - assert id_cols_check.validate(np.array([[0, 1], [1, 2], [2, 3], [3, 4], [4, 5]])) == { - "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": 0}).to_dict(), - DataCheckWarning(message="Column '1' is 80.0% or more likely to be an ID column", - data_check_name=id_data_check_name, - message_code=DataCheckMessageCode.HAS_ID_COLUMN, - details={"column": 1}).to_dict()], + assert id_cols_check.validate( + np.array([[0, 1], [1, 2], [2, 3], [3, 4], [4, 5]]) + ) == { + "warnings": [ + DataCheckWarning( + message="Column '0' is 80.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": 0}, + ).to_dict(), + DataCheckWarning( + message="Column '1' is 80.0% or more likely to be an ID column", + data_check_name=id_data_check_name, + message_code=DataCheckMessageCode.HAS_ID_COLUMN, + details={"column": 1}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 0}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 1}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": 0} + ).to_dict(), + DataCheckAction( + 
DataCheckActionCode.DROP_COL, metadata={"column": 1} + ).to_dict(), + ], } diff --git a/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py b/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py index 72aefae3a5..d83322aa1f 100644 --- a/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py +++ b/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py @@ -10,66 +10,85 @@ DataCheckMessageCode, DataChecks, DataCheckWarning, - InvalidTargetDataCheck + InvalidTargetDataCheck, ) from evalml.exceptions import DataCheckInitError -from evalml.objectives import ( - MAPE, - MeanSquaredLogError, - RootMeanSquaredLogError -) -from evalml.problem_types import ( - ProblemTypes, - is_binary, - is_multiclass, - is_regression -) +from evalml.objectives import MAPE, MeanSquaredLogError, RootMeanSquaredLogError +from evalml.problem_types import ProblemTypes, is_binary, is_multiclass, is_regression from evalml.utils.woodwork_utils import numeric_and_boolean_ww invalid_targets_data_check_name = InvalidTargetDataCheck.name def test_invalid_target_data_check_invalid_n_unique(): - with pytest.raises(ValueError, match="`n_unique` must be a non-negative integer value."): - InvalidTargetDataCheck("regression", get_default_primary_search_objective("regression"), n_unique=-1) + with pytest.raises( + ValueError, match="`n_unique` must be a non-negative integer value." + ): + InvalidTargetDataCheck( + "regression", + get_default_primary_search_objective("regression"), + n_unique=-1, + ) def test_invalid_target_data_check_nan_error(): X = pd.DataFrame({"col": [1, 2, 3]}) - invalid_targets_check = InvalidTargetDataCheck("regression", get_default_primary_search_objective("regression")) + invalid_targets_check = InvalidTargetDataCheck( + "regression", get_default_primary_search_objective("regression") + ) - assert invalid_targets_check.validate(X, y=pd.Series([1, 2, 3])) == {"warnings": [], "errors": [], "actions": []} + assert invalid_targets_check.validate(X, y=pd.Series([1, 2, 3])) == { + "warnings": [], + "errors": [], + "actions": [], + } assert invalid_targets_check.validate(X, y=pd.Series([np.nan, np.nan, np.nan])) == { "warnings": [], - "errors": [DataCheckError(message="Target is either empty or fully null.", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL, - details={}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message="Target is either empty or fully null.", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL, + details={}, + ).to_dict() + ], + "actions": [], } def test_invalid_target_data_check_numeric_binary_classification_valid_float(): y = pd.Series([0.0, 1.0, 0.0, 1.0]) X = pd.DataFrame({"col": range(len(y))}) - invalid_targets_check = InvalidTargetDataCheck("binary", get_default_primary_search_objective("binary")) - assert invalid_targets_check.validate(X, y) == {"warnings": [], "errors": [], "actions": []} + invalid_targets_check = InvalidTargetDataCheck( + "binary", get_default_primary_search_objective("binary") + ) + assert invalid_targets_check.validate(X, y) == { + "warnings": [], + "errors": [], + "actions": [], + } def test_invalid_target_data_check_multiclass_two_examples_per_class(): y = pd.Series([0] + [1] * 19 + [2] * 80) X = pd.DataFrame({"col": range(len(y))}) - invalid_targets_check = InvalidTargetDataCheck("multiclass", get_default_primary_search_objective("binary")) + invalid_targets_check = 
InvalidTargetDataCheck( + "multiclass", get_default_primary_search_objective("binary") + ) expected_message = "Target does not have at least two instances per class which is required for multiclass classification" # with 1 class not having min 2 instances assert invalid_targets_check.validate(X, y) == { "warnings": [], - "errors": [DataCheckError(message=expected_message, - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS, - details={"least_populated_class_labels": [0]}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message=expected_message, + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS, + details={"least_populated_class_labels": [0]}, + ).to_dict() + ], + "actions": [], } y = pd.Series([0] + [1] + [2] * 98) @@ -77,82 +96,128 @@ def test_invalid_target_data_check_multiclass_two_examples_per_class(): # with 2 classes not having min 2 instances assert invalid_targets_check.validate(X, y) == { "warnings": [], - "errors": [DataCheckError(message=expected_message, - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS, - details={"least_populated_class_labels": [0, 1]}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message=expected_message, + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS, + details={"least_populated_class_labels": [0, 1]}, + ).to_dict() + ], + "actions": [], } -@pytest.mark.parametrize("pd_type", ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool']) +@pytest.mark.parametrize( + "pd_type", ["int16", "int32", "int64", "float16", "float32", "float64", "bool"] +) def test_invalid_target_data_check_invalid_pandas_data_types_error(pd_type): y = pd.Series([0, 1, 0, 0, 1, 0, 1, 0]) y = y.astype(pd_type) X = pd.DataFrame({"col": range(len(y))}) - invalid_targets_check = InvalidTargetDataCheck("binary", get_default_primary_search_objective("binary")) + invalid_targets_check = InvalidTargetDataCheck( + "binary", get_default_primary_search_objective("binary") + ) - assert invalid_targets_check.validate(X, y) == {"warnings": [], "errors": [], "actions": []} + assert invalid_targets_check.validate(X, y) == { + "warnings": [], + "errors": [], + "actions": [], + } - y = pd.Series(pd.date_range('2000-02-03', periods=5, freq='W')) + y = pd.Series(pd.date_range("2000-02-03", periods=5, freq="W")) X = pd.DataFrame({"col": range(len(y))}) unique_values = y.value_counts().index.tolist() assert invalid_targets_check.validate(X, y) == { "warnings": [], - "errors": [DataCheckError(message="Target is unsupported {} type. Valid Woodwork logical types include: {}" - .format("Datetime", - ", ".join([ltype.type_string for ltype in numeric_and_boolean_ww])), - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE, - details={"unsupported_type": "datetime"}).to_dict(), - DataCheckError(message="Binary class targets require exactly two unique values.", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES, - details={"target_values": unique_values}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message="Target is unsupported {} type. 
Valid Woodwork logical types include: {}".format( + "Datetime", + ", ".join([ltype.type_string for ltype in numeric_and_boolean_ww]), + ), + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE, + details={"unsupported_type": "datetime"}, + ).to_dict(), + DataCheckError( + message="Binary class targets require exactly two unique values.", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES, + details={"target_values": unique_values}, + ).to_dict(), + ], + "actions": [], } def test_invalid_target_y_none(): - invalid_targets_check = InvalidTargetDataCheck("binary", get_default_primary_search_objective("binary")) + invalid_targets_check = InvalidTargetDataCheck( + "binary", get_default_primary_search_objective("binary") + ) assert invalid_targets_check.validate(pd.DataFrame(), y=None) == { "warnings": [], - "errors": [DataCheckError(message="Target is None", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_IS_NONE, - details={}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message="Target is None", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_IS_NONE, + details={}, + ).to_dict() + ], + "actions": [], } def test_invalid_target_data_input_formats(): - invalid_targets_check = InvalidTargetDataCheck("binary", get_default_primary_search_objective("binary")) + invalid_targets_check = InvalidTargetDataCheck( + "binary", get_default_primary_search_objective("binary") + ) # test empty pd.Series X = pd.DataFrame() messages = invalid_targets_check.validate(X, pd.Series()) assert messages == { "warnings": [], - "errors": [DataCheckError(message="Target is either empty or fully null.", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL, - details={}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message="Target is either empty or fully null.", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL, + details={}, + ).to_dict() + ], + "actions": [], } expected = { "warnings": [], - "errors": [DataCheckError(message="3 row(s) (75.0%) of target values are null", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_HAS_NULL, - details={"num_null_rows": 3, "pct_null_rows": 75}).to_dict(), - DataCheckError(message="Binary class targets require exactly two unique values.", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES, - details={"target_values": [0]}).to_dict()], - "actions": [DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, "impute_strategy": "most_frequent"}).to_dict()] + "errors": [ + DataCheckError( + message="3 row(s) (75.0%) of target values are null", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_HAS_NULL, + details={"num_null_rows": 3, "pct_null_rows": 75}, + ).to_dict(), + DataCheckError( + message="Binary class targets require exactly two unique values.", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES, + details={"target_values": [0]}, + ).to_dict(), + ], + "actions": [ + DataCheckAction( + DataCheckActionCode.IMPUTE_COL, + metadata={ + "column": 
None, + "is_target": True, + "impute_strategy": "most_frequent", + }, + ).to_dict() + ], } # test Woodwork y = pd.Series([None, None, None, 0]) @@ -175,21 +240,29 @@ def test_invalid_target_data_input_formats(): assert messages == expected -@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY]) +@pytest.mark.parametrize( + "problem_type", [ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY] +) def test_invalid_target_data_check_n_unique(problem_type): y = pd.Series(list(range(100, 200)) + list(range(200))) unique_values = y.value_counts().index.tolist()[:100] # n_unique defaults to 100 X = pd.DataFrame({"col": range(len(y))}) - invalid_targets_check = InvalidTargetDataCheck(problem_type, get_default_primary_search_objective(problem_type)) + invalid_targets_check = InvalidTargetDataCheck( + problem_type, get_default_primary_search_objective(problem_type) + ) # Test default value of n_unique assert invalid_targets_check.validate(X, y) == { "warnings": [], - "errors": [DataCheckError(message="Binary class targets require exactly two unique values.", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES, - details={"target_values": unique_values}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message="Binary class targets require exactly two unique values.", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES, + details={"target_values": unique_values}, + ).to_dict() + ], + "actions": [], } # Test number of unique values < n_unique @@ -199,125 +272,217 @@ def test_invalid_target_data_check_n_unique(problem_type): unique_values = y.value_counts().index.tolist() assert invalid_targets_check.validate(X, y) == { "warnings": [], - "errors": [DataCheckError(message="Binary class targets require exactly two unique values.", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES, - details={"target_values": unique_values}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message="Binary class targets require exactly two unique values.", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES, + details={"target_values": unique_values}, + ).to_dict() + ], + "actions": [], } # Test n_unique is None - invalid_targets_check = InvalidTargetDataCheck("binary", get_default_primary_search_objective("binary"), - n_unique=None) + invalid_targets_check = InvalidTargetDataCheck( + "binary", get_default_primary_search_objective("binary"), n_unique=None + ) y = pd.Series(range(150)) X = pd.DataFrame({"col": range(len(y))}) unique_values = y.value_counts().index.tolist() assert invalid_targets_check.validate(X, y) == { "warnings": [], - "errors": [DataCheckError(message="Binary class targets require exactly two unique values.", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES, - details={"target_values": unique_values}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message="Binary class targets require exactly two unique values.", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES, + details={"target_values": unique_values}, + ).to_dict() + ], + "actions": [], } -@pytest.mark.parametrize("objective", - ['Root Mean 
Squared Log Error', 'Mean Squared Log Error', 'Mean Absolute Percentage Error']) -def test_invalid_target_data_check_invalid_labels_for_nonnegative_objective_names(objective): - X = pd.DataFrame({'column_one': [100, 200, 100, 200, 200, 100, 200, 100] * 25}) +@pytest.mark.parametrize( + "objective", + [ + "Root Mean Squared Log Error", + "Mean Squared Log Error", + "Mean Absolute Percentage Error", + ], +) +def test_invalid_target_data_check_invalid_labels_for_nonnegative_objective_names( + objective, +): + X = pd.DataFrame({"column_one": [100, 200, 100, 200, 200, 100, 200, 100] * 25}) y = pd.Series([2, 2, 3, 3, -1, -1, 1, 1] * 25) - data_checks = DataChecks([InvalidTargetDataCheck], {"InvalidTargetDataCheck": {"problem_type": "multiclass", - "objective": objective}}) + data_checks = DataChecks( + [InvalidTargetDataCheck], + { + "InvalidTargetDataCheck": { + "problem_type": "multiclass", + "objective": objective, + } + }, + ) assert data_checks.validate(X, y) == { "warnings": [], - "errors": [DataCheckError( - message=f"Target has non-positive values which is not supported for {objective}", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_INCOMPATIBLE_OBJECTIVE, - details={"Count of offending values": sum(val <= 0 for val in y.values.flatten())}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message=f"Target has non-positive values which is not supported for {objective}", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_INCOMPATIBLE_OBJECTIVE, + details={ + "Count of offending values": sum( + val <= 0 for val in y.values.flatten() + ) + }, + ).to_dict() + ], + "actions": [], } - X = pd.DataFrame({'column_one': [100, 200, 100, 200, 100]}) + X = pd.DataFrame({"column_one": [100, 200, 100, 200, 100]}) y = pd.Series([2, 3, 0, 1, 1]) - invalid_targets_check = InvalidTargetDataCheck(problem_type="regression", objective=objective) + invalid_targets_check = InvalidTargetDataCheck( + problem_type="regression", objective=objective + ) assert invalid_targets_check.validate(X, y) == { "warnings": [], - "errors": [DataCheckError( - message=f"Target has non-positive values which is not supported for {objective}", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_INCOMPATIBLE_OBJECTIVE, - details={"Count of offending values": sum(val <= 0 for val in y.values.flatten())}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message=f"Target has non-positive values which is not supported for {objective}", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_INCOMPATIBLE_OBJECTIVE, + details={ + "Count of offending values": sum( + val <= 0 for val in y.values.flatten() + ) + }, + ).to_dict() + ], + "actions": [], } -@pytest.mark.parametrize("objective", [RootMeanSquaredLogError(), MeanSquaredLogError(), MAPE()]) -def test_invalid_target_data_check_invalid_labels_for_nonnegative_objective_instances(objective): - X = pd.DataFrame({'column_one': [100, 200, 100, 200, 200, 100, 200, 100] * 25}) +@pytest.mark.parametrize( + "objective", [RootMeanSquaredLogError(), MeanSquaredLogError(), MAPE()] +) +def test_invalid_target_data_check_invalid_labels_for_nonnegative_objective_instances( + objective, +): + X = pd.DataFrame({"column_one": [100, 200, 100, 200, 200, 100, 200, 100] * 25}) y = pd.Series([2, 2, 3, 3, -1, -1, 1, 1] * 25) - data_checks = DataChecks([InvalidTargetDataCheck], {"InvalidTargetDataCheck": 
{"problem_type": "multiclass", - "objective": objective}}) + data_checks = DataChecks( + [InvalidTargetDataCheck], + { + "InvalidTargetDataCheck": { + "problem_type": "multiclass", + "objective": objective, + } + }, + ) assert data_checks.validate(X, y) == { "warnings": [], - "errors": [DataCheckError( - message=f"Target has non-positive values which is not supported for {objective.name}", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_INCOMPATIBLE_OBJECTIVE, - details={"Count of offending values": sum(val <= 0 for val in y.values.flatten())}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message=f"Target has non-positive values which is not supported for {objective.name}", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_INCOMPATIBLE_OBJECTIVE, + details={ + "Count of offending values": sum( + val <= 0 for val in y.values.flatten() + ) + }, + ).to_dict() + ], + "actions": [], } -def test_invalid_target_data_check_invalid_labels_for_objectives(time_series_core_objectives): - X = pd.DataFrame({'column_one': [100, 200, 100, 200, 200, 100, 200, 100] * 25}) +def test_invalid_target_data_check_invalid_labels_for_objectives( + time_series_core_objectives, +): + X = pd.DataFrame({"column_one": [100, 200, 100, 200, 200, 100, 200, 100] * 25}) y = pd.Series([2, 2, 3, 3, -1, -1, 1, 1] * 25) for objective in time_series_core_objectives: if not objective.positive_only: - data_checks = DataChecks([InvalidTargetDataCheck], {"InvalidTargetDataCheck": {"problem_type": "multiclass", - "objective": objective}}) + data_checks = DataChecks( + [InvalidTargetDataCheck], + { + "InvalidTargetDataCheck": { + "problem_type": "multiclass", + "objective": objective, + } + }, + ) assert data_checks.validate(X, y) == { "warnings": [], "errors": [], - "actions": [] + "actions": [], } - X = pd.DataFrame({'column_one': [100, 200, 100, 200, 100]}) + X = pd.DataFrame({"column_one": [100, 200, 100, 200, 100]}) y = pd.Series([2, 3, 0, 1, 1]) for objective in time_series_core_objectives: if not objective.positive_only: - invalid_targets_check = InvalidTargetDataCheck(problem_type="regression", objective=objective) - assert invalid_targets_check.validate(X, y) == {"warnings": [], "errors": [], "actions": []} + invalid_targets_check = InvalidTargetDataCheck( + problem_type="regression", objective=objective + ) + assert invalid_targets_check.validate(X, y) == { + "warnings": [], + "errors": [], + "actions": [], + } -@pytest.mark.parametrize("objective", - ['Root Mean Squared Log Error', 'Mean Squared Log Error', 'Mean Absolute Percentage Error']) +@pytest.mark.parametrize( + "objective", + [ + "Root Mean Squared Log Error", + "Mean Squared Log Error", + "Mean Absolute Percentage Error", + ], +) def test_invalid_target_data_check_valid_labels_for_nonnegative_objectives(objective): - X = pd.DataFrame({'column_one': [100, 100, 200, 300, 100, 200, 100] * 25}) + X = pd.DataFrame({"column_one": [100, 100, 200, 300, 100, 200, 100] * 25}) y = pd.Series([2, 2, 3, 3, 1, 1, 1] * 25) - data_checks = DataChecks([InvalidTargetDataCheck], {"InvalidTargetDataCheck": {"problem_type": "multiclass", - "objective": objective}}) + data_checks = DataChecks( + [InvalidTargetDataCheck], + { + "InvalidTargetDataCheck": { + "problem_type": "multiclass", + "objective": objective, + } + }, + ) assert data_checks.validate(X, y) == {"warnings": [], "errors": [], "actions": []} def test_invalid_target_data_check_initialize_with_none_objective(): with 
pytest.raises(DataCheckInitError, match="Encountered the following error"): - DataChecks([InvalidTargetDataCheck], {"InvalidTargetDataCheck": {"problem_type": "multiclass", - "objective": None}}) - - -@pytest.mark.parametrize("problem_type", - ['regression']) + DataChecks( + [InvalidTargetDataCheck], + { + "InvalidTargetDataCheck": { + "problem_type": "multiclass", + "objective": None, + } + }, + ) + + +@pytest.mark.parametrize("problem_type", ["regression"]) def test_invalid_target_data_check_regression_problem_nonnumeric_data(problem_type): y_categorical = pd.Series(["Peace", "Is", "A", "Lie"] * 100) y_mixed_cat_numeric = pd.Series(["Peace", 2, "A", 4] * 100) @@ -329,14 +494,27 @@ def test_invalid_target_data_check_regression_problem_nonnumeric_data(problem_ty message=f"Target data type should be numeric for regression type problems.", data_check_name=invalid_targets_data_check_name, message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE, - details={}).to_dict() - - invalid_targets_check = InvalidTargetDataCheck(problem_type, get_default_primary_search_objective(problem_type)) - assert invalid_targets_check.validate(X=pd.DataFrame({"col": range(len(y_categorical))}), y=y_categorical) == {"warnings": [], "errors": [data_check_error], "actions": []} - assert invalid_targets_check.validate(X=pd.DataFrame({"col": range(len(y_mixed_cat_numeric))}), y=y_mixed_cat_numeric) == {"warnings": [], "errors": [data_check_error], "actions": []} - assert invalid_targets_check.validate(X=pd.DataFrame({"col": range(len(y_integer))}), y=y_integer) == {"warnings": [], "errors": [], "actions": []} - assert invalid_targets_check.validate(X=pd.DataFrame({"col": range(len(y_float))}), y=y_float) == {"warnings": [], "errors": [], "actions": []} - assert invalid_targets_check.validate(X=pd.DataFrame({"col": range(len(y_numeric))}), y=y_numeric) == {"warnings": [], "errors": [], "actions": []} + details={}, + ).to_dict() + + invalid_targets_check = InvalidTargetDataCheck( + problem_type, get_default_primary_search_objective(problem_type) + ) + assert invalid_targets_check.validate( + X=pd.DataFrame({"col": range(len(y_categorical))}), y=y_categorical + ) == {"warnings": [], "errors": [data_check_error], "actions": []} + assert invalid_targets_check.validate( + X=pd.DataFrame({"col": range(len(y_mixed_cat_numeric))}), y=y_mixed_cat_numeric + ) == {"warnings": [], "errors": [data_check_error], "actions": []} + assert invalid_targets_check.validate( + X=pd.DataFrame({"col": range(len(y_integer))}), y=y_integer + ) == {"warnings": [], "errors": [], "actions": []} + assert invalid_targets_check.validate( + X=pd.DataFrame({"col": range(len(y_float))}), y=y_float + ) == {"warnings": [], "errors": [], "actions": []} + assert invalid_targets_check.validate( + X=pd.DataFrame({"col": range(len(y_numeric))}), y=y_numeric + ) == {"warnings": [], "errors": [], "actions": []} def test_invalid_target_data_check_multiclass_problem_binary_data(): @@ -347,37 +525,70 @@ def test_invalid_target_data_check_multiclass_problem_binary_data(): message=f"Target has two or less classes, which is too few for multiclass problems. 
Consider changing to binary.", data_check_name=invalid_targets_data_check_name, message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES, - details={"num_classes": len(set(y_binary))}).to_dict() - - invalid_targets_check = InvalidTargetDataCheck("multiclass", get_default_primary_search_objective("multiclass")) - assert invalid_targets_check.validate(X=pd.DataFrame({"col": range(len(y_multiclass))}), y=y_multiclass) == {"warnings": [], "errors": [], "actions": []} - assert invalid_targets_check.validate(X=pd.DataFrame({"col": range(len(y_binary))}), y=y_binary) == {"warnings": [], "errors": [data_check_error], "actions": []} - - -@pytest.mark.parametrize("problem_type", [ProblemTypes.MULTICLASS, ProblemTypes.TIME_SERIES_MULTICLASS]) -def test_invalid_target_data_check_multiclass_problem_almost_continuous_data(problem_type): - invalid_targets_check = InvalidTargetDataCheck(problem_type, get_default_primary_search_objective(problem_type)) - y_multiclass_high_classes = pd.Series(list(range(0, 100)) * 3) # 100 classes, 300 samples, .33 class/sample ratio + details={"num_classes": len(set(y_binary))}, + ).to_dict() + + invalid_targets_check = InvalidTargetDataCheck( + "multiclass", get_default_primary_search_objective("multiclass") + ) + assert invalid_targets_check.validate( + X=pd.DataFrame({"col": range(len(y_multiclass))}), y=y_multiclass + ) == {"warnings": [], "errors": [], "actions": []} + assert invalid_targets_check.validate( + X=pd.DataFrame({"col": range(len(y_binary))}), y=y_binary + ) == {"warnings": [], "errors": [data_check_error], "actions": []} + + +@pytest.mark.parametrize( + "problem_type", [ProblemTypes.MULTICLASS, ProblemTypes.TIME_SERIES_MULTICLASS] +) +def test_invalid_target_data_check_multiclass_problem_almost_continuous_data( + problem_type, +): + invalid_targets_check = InvalidTargetDataCheck( + problem_type, get_default_primary_search_objective(problem_type) + ) + y_multiclass_high_classes = pd.Series( + list(range(0, 100)) * 3 + ) # 100 classes, 300 samples, .33 class/sample ratio X = pd.DataFrame({"col": range(len(y_multiclass_high_classes))}) data_check_warning = DataCheckWarning( message=f"Target has a large number of unique values, could be regression type problem.", data_check_name=invalid_targets_data_check_name, message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS, - details={"class_to_value_ratio": 1 / 3}).to_dict() - assert invalid_targets_check.validate(X, y=y_multiclass_high_classes) == {"warnings": [data_check_warning], "errors": [], "actions": []} + details={"class_to_value_ratio": 1 / 3}, + ).to_dict() + assert invalid_targets_check.validate(X, y=y_multiclass_high_classes) == { + "warnings": [data_check_warning], + "errors": [], + "actions": [], + } - y_multiclass_med_classes = pd.Series(list(range(0, 5)) * 20) # 5 classes, 100 samples, .05 class/sample ratio + y_multiclass_med_classes = pd.Series( + list(range(0, 5)) * 20 + ) # 5 classes, 100 samples, .05 class/sample ratio X = pd.DataFrame({"col": range(len(y_multiclass_med_classes))}) data_check_warning = DataCheckWarning( message=f"Target has a large number of unique values, could be regression type problem.", data_check_name=invalid_targets_data_check_name, message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS, - details={"class_to_value_ratio": .05}).to_dict() - assert invalid_targets_check.validate(X, y=y_multiclass_med_classes) == {"warnings": [data_check_warning], "errors": [], "actions": []} + details={"class_to_value_ratio": 0.05}, + ).to_dict() 
+ assert invalid_targets_check.validate(X, y=y_multiclass_med_classes) == { + "warnings": [data_check_warning], + "errors": [], + "actions": [], + } - y_multiclass_low_classes = pd.Series(list(range(0, 3)) * 100) # 2 classes, 300 samples, .01 class/sample ratio + y_multiclass_low_classes = pd.Series( + list(range(0, 3)) * 100 + ) # 2 classes, 300 samples, .01 class/sample ratio X = pd.DataFrame({"col": range(len(y_multiclass_low_classes))}) - assert invalid_targets_check.validate(X, y=y_multiclass_low_classes) == {"warnings": [], "errors": [], "actions": []} + assert invalid_targets_check.validate(X, y=y_multiclass_low_classes) == { + "warnings": [], + "errors": [], + "actions": [], + } def test_invalid_target_data_check_mismatched_indices(): @@ -386,28 +597,48 @@ def test_invalid_target_data_check_mismatched_indices(): y_diff_index = pd.Series([0, 1, 0], index=[1, 5, 10]) y_diff_index_order = pd.Series([0, 1, 0], index=[0, 2, 1]) - invalid_targets_check = InvalidTargetDataCheck("binary", get_default_primary_search_objective("binary")) - assert invalid_targets_check.validate(X=None, y=y_same_index) == {"warnings": [], "errors": [], "actions": []} - assert invalid_targets_check.validate(X, y_same_index) == {"warnings": [], "errors": [], "actions": []} + invalid_targets_check = InvalidTargetDataCheck( + "binary", get_default_primary_search_objective("binary") + ) + assert invalid_targets_check.validate(X=None, y=y_same_index) == { + "warnings": [], + "errors": [], + "actions": [], + } + assert invalid_targets_check.validate(X, y_same_index) == { + "warnings": [], + "errors": [], + "actions": [], + } X_index_missing = list(set(y_diff_index.index) - set(X.index)) y_index_missing = list(set(X.index) - set(y_diff_index.index)) assert invalid_targets_check.validate(X, y_diff_index) == { - "warnings": [DataCheckWarning(message="Input target and features have mismatched indices", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.MISMATCHED_INDICES, - details={"indices_not_in_features": X_index_missing, - "indices_not_in_target": y_index_missing}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Input target and features have mismatched indices", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.MISMATCHED_INDICES, + details={ + "indices_not_in_features": X_index_missing, + "indices_not_in_target": y_index_missing, + }, + ).to_dict() + ], "errors": [], - "actions": [] + "actions": [], } assert invalid_targets_check.validate(X, y_diff_index_order) == { - "warnings": [DataCheckWarning(message="Input target and features have mismatched indices order", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.MISMATCHED_INDICES_ORDER, - details={}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Input target and features have mismatched indices order", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.MISMATCHED_INDICES_ORDER, + details={}, + ).to_dict() + ], "errors": [], - "actions": [] + "actions": [], } # Test that we only store ten mismatches when there are more than 10 differences in indices found @@ -416,41 +647,61 @@ def test_invalid_target_data_check_mismatched_indices(): X_index_missing = list(set(y_more_than_ten_diff_indices.index) - set(X.index)) y_index_missing = list(set(X_large.index) - set(y_more_than_ten_diff_indices.index)) assert invalid_targets_check.validate(X_large, y_more_than_ten_diff_indices) == { - "warnings": 
[DataCheckWarning(message="Input target and features have mismatched indices", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.MISMATCHED_INDICES, - details={"indices_not_in_features": X_index_missing[:10], - "indices_not_in_target": y_index_missing[:10]}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Input target and features have mismatched indices", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.MISMATCHED_INDICES, + details={ + "indices_not_in_features": X_index_missing[:10], + "indices_not_in_target": y_index_missing[:10], + }, + ).to_dict() + ], "errors": [], - "actions": [] + "actions": [], } def test_invalid_target_data_check_different_lengths(): X = pd.DataFrame({"col": [1, 2, 3]}) y_diff_len = pd.Series([0, 1]) - invalid_targets_check = InvalidTargetDataCheck("binary", get_default_primary_search_objective("binary")) - assert invalid_targets_check.validate(X, y_diff_len) == {"warnings": [DataCheckWarning(message="Input target and features have different lengths", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.MISMATCHED_LENGTHS, - details={"features_length": len(X.index), "target_length": len(y_diff_len.index)}).to_dict(), - DataCheckWarning(message="Input target and features have mismatched indices", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.MISMATCHED_INDICES, - details={"indices_not_in_features": [], - "indices_not_in_target": [2]}).to_dict()], - "errors": [], - "actions": []} + invalid_targets_check = InvalidTargetDataCheck( + "binary", get_default_primary_search_objective("binary") + ) + assert invalid_targets_check.validate(X, y_diff_len) == { + "warnings": [ + DataCheckWarning( + message="Input target and features have different lengths", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.MISMATCHED_LENGTHS, + details={ + "features_length": len(X.index), + "target_length": len(y_diff_len.index), + }, + ).to_dict(), + DataCheckWarning( + message="Input target and features have mismatched indices", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.MISMATCHED_INDICES, + details={"indices_not_in_features": [], "indices_not_in_target": [2]}, + ).to_dict(), + ], + "errors": [], + "actions": [], + } def test_invalid_target_data_check_numeric_binary_does_not_return_warnings(): y = pd.Series([1, 5, 1, 5, 1, 1]) X = pd.DataFrame({"col": range(len(y))}) - invalid_targets_check = InvalidTargetDataCheck("binary", get_default_primary_search_objective("binary")) + invalid_targets_check = InvalidTargetDataCheck( + "binary", get_default_primary_search_objective("binary") + ) assert invalid_targets_check.validate(X, y) == { "warnings": [], "errors": [], - "actions": [] + "actions": [], } @@ -458,31 +709,58 @@ def test_invalid_target_data_check_numeric_binary_does_not_return_warnings(): def test_invalid_target_data_action_for_data_with_null(problem_type): y = pd.Series([None, None, None, 0, 0, 0, 0, 0, 0, 0]) X = pd.DataFrame({"col": range(len(y))}) - invalid_targets_check = InvalidTargetDataCheck(problem_type, get_default_primary_search_objective(problem_type)) + invalid_targets_check = InvalidTargetDataCheck( + problem_type, get_default_primary_search_objective(problem_type) + ) impute_strategy = "mean" if is_regression(problem_type) else "most_frequent" expected = { "warnings": [], - "errors": [DataCheckError(message="3 row(s) (30.0%) of target 
values are null", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_HAS_NULL, - details={"num_null_rows": 3, "pct_null_rows": 30.0}).to_dict()], - "actions": [DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, "impute_strategy": impute_strategy}).to_dict()] + "errors": [ + DataCheckError( + message="3 row(s) (30.0%) of target values are null", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_HAS_NULL, + details={"num_null_rows": 3, "pct_null_rows": 30.0}, + ).to_dict() + ], + "actions": [ + DataCheckAction( + DataCheckActionCode.IMPUTE_COL, + metadata={ + "column": None, + "is_target": True, + "impute_strategy": impute_strategy, + }, + ).to_dict() + ], } if is_binary(problem_type): - expected["errors"].append(DataCheckError(message="Binary class targets require exactly two unique values.", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES, - details={"target_values": [0]}).to_dict()) + expected["errors"].append( + DataCheckError( + message="Binary class targets require exactly two unique values.", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES, + details={"target_values": [0]}, + ).to_dict() + ) elif is_multiclass(problem_type): - expected["errors"].append(DataCheckError(message=f"Target has two or less classes, which is too few for multiclass problems. Consider changing to binary.", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES, - details={"num_classes": 1}).to_dict()) - expected["warnings"].append(DataCheckWarning(message=f"Target has a large number of unique values, could be regression type problem.", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS, - details={"class_to_value_ratio": 0.1}).to_dict()) + expected["errors"].append( + DataCheckError( + message=f"Target has two or less classes, which is too few for multiclass problems. 
Consider changing to binary.", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES, + details={"num_classes": 1}, + ).to_dict() + ) + expected["warnings"].append( + DataCheckWarning( + message=f"Target has a large number of unique values, could be regression type problem.", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS, + details={"class_to_value_ratio": 0.1}, + ).to_dict() + ) messages = invalid_targets_check.validate(X, y) assert messages == expected @@ -490,18 +768,24 @@ def test_invalid_target_data_action_for_data_with_null(problem_type): @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) def test_invalid_target_data_action_for_all_null(problem_type): - invalid_targets_check = InvalidTargetDataCheck(problem_type, get_default_primary_search_objective(problem_type)) + invalid_targets_check = InvalidTargetDataCheck( + problem_type, get_default_primary_search_objective(problem_type) + ) y_all_null = pd.Series([None, None, None]) X = pd.DataFrame({"col": range(len(y_all_null))}) expected = { "warnings": [], - "errors": [DataCheckError(message="Target is either empty or fully null.", - data_check_name=invalid_targets_data_check_name, - message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL, - details={}).to_dict()], - "actions": [] + "errors": [ + DataCheckError( + message="Target is either empty or fully null.", + data_check_name=invalid_targets_data_check_name, + message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL, + details={}, + ).to_dict() + ], + "actions": [], } messages = invalid_targets_check.validate(X, y_all_null) assert messages == expected diff --git a/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py b/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py index 3d648ee891..b086ae74a2 100644 --- a/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py +++ b/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py @@ -4,7 +4,7 @@ from evalml.data_checks import ( DataCheckMessageCode, DataCheckWarning, - MulticollinearityDataCheck + MulticollinearityDataCheck, ) multi_data_check_name = MulticollinearityDataCheck.name @@ -23,52 +23,91 @@ def test_multicollinearity_data_check_init(): multi_check = MulticollinearityDataCheck(threshold=1.0) assert multi_check.threshold == 1.0 - with pytest.raises(ValueError, match="threshold must be a float between 0 and 1, inclusive."): + with pytest.raises( + ValueError, match="threshold must be a float between 0 and 1, inclusive." + ): MulticollinearityDataCheck(threshold=-0.1) - with pytest.raises(ValueError, match="threshold must be a float between 0 and 1, inclusive."): + with pytest.raises( + ValueError, match="threshold must be a float between 0 and 1, inclusive." 
+ ): MulticollinearityDataCheck(threshold=1.1) def test_multicollinearity_returns_warning(): col = pd.Series([1, 0, 2, 3, 4]) - X = pd.DataFrame({'col_1': col, - 'col_2': col * 3, - 'col_3': ~col, - 'col_4': col / 2, - 'col_5': col + 1, - 'not_collinear': [0, 1, 0, 0, 0]}) + X = pd.DataFrame( + { + "col_1": col, + "col_2": col * 3, + "col_3": ~col, + "col_4": col / 2, + "col_5": col + 1, + "not_collinear": [0, 1, 0, 0, 0], + } + ) multi_check = MulticollinearityDataCheck(threshold=0.95) assert multi_check.validate(X) == { - "warnings": [DataCheckWarning(message="Columns are likely to be correlated: [('col_1', 'col_2'), ('col_1', 'col_3'), ('col_1', 'col_4'), ('col_1', 'col_5'), ('col_2', 'col_3'), ('col_2', 'col_4'), ('col_2', 'col_5'), ('col_3', 'col_4'), ('col_3', 'col_5'), ('col_4', 'col_5')]", - data_check_name=multi_data_check_name, - message_code=DataCheckMessageCode.IS_MULTICOLLINEAR, - details={'columns': [('col_1', 'col_2'), ('col_1', 'col_3'), ('col_1', 'col_4'), ('col_1', 'col_5'), - ('col_2', 'col_3'), ('col_2', 'col_4'), ('col_2', 'col_5'), - ('col_3', 'col_4'), ('col_3', 'col_5'), ('col_4', 'col_5')]}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Columns are likely to be correlated: [('col_1', 'col_2'), ('col_1', 'col_3'), ('col_1', 'col_4'), ('col_1', 'col_5'), ('col_2', 'col_3'), ('col_2', 'col_4'), ('col_2', 'col_5'), ('col_3', 'col_4'), ('col_3', 'col_5'), ('col_4', 'col_5')]", + data_check_name=multi_data_check_name, + message_code=DataCheckMessageCode.IS_MULTICOLLINEAR, + details={ + "columns": [ + ("col_1", "col_2"), + ("col_1", "col_3"), + ("col_1", "col_4"), + ("col_1", "col_5"), + ("col_2", "col_3"), + ("col_2", "col_4"), + ("col_2", "col_5"), + ("col_3", "col_4"), + ("col_3", "col_5"), + ("col_4", "col_5"), + ] + }, + ).to_dict() + ], "errors": [], - "actions": [] + "actions": [], } -@pytest.mark.parametrize("data_type", ['pd', 'ww']) +@pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_multicollinearity_nonnumeric_cols(data_type, make_data_type): - X = pd.DataFrame({'col_1': ["a", "b", "c", "d", "a"], - 'col_2': ["w", "x", "y", "z", "b"], - 'col_3': ["a", "a", "c", "d", "a"], - 'col_4': ["a", "b", "c", "d", "a"], - 'col_5': ["0", "0", "1", "2", "0"], - 'col_6': [1, 1, 2, 3, 1] - }) + X = pd.DataFrame( + { + "col_1": ["a", "b", "c", "d", "a"], + "col_2": ["w", "x", "y", "z", "b"], + "col_3": ["a", "a", "c", "d", "a"], + "col_4": ["a", "b", "c", "d", "a"], + "col_5": ["0", "0", "1", "2", "0"], + "col_6": [1, 1, 2, 3, 1], + } + ) X = make_data_type(data_type, X) multi_check = MulticollinearityDataCheck(threshold=0.9) assert multi_check.validate(X) == { - "warnings": [DataCheckWarning(message="Columns are likely to be correlated: [('col_1', 'col_4'), ('col_3', 'col_5'), ('col_3', 'col_6'), ('col_5', 'col_6'), ('col_1', 'col_2'), ('col_2', 'col_4')]", - data_check_name=multi_data_check_name, - message_code=DataCheckMessageCode.IS_MULTICOLLINEAR, - details={'columns': [('col_1', 'col_4'), ('col_3', 'col_5'), ('col_3', 'col_6'), ('col_5', 'col_6'), ('col_1', 'col_2'), ('col_2', 'col_4')]}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Columns are likely to be correlated: [('col_1', 'col_4'), ('col_3', 'col_5'), ('col_3', 'col_6'), ('col_5', 'col_6'), ('col_1', 'col_2'), ('col_2', 'col_4')]", + data_check_name=multi_data_check_name, + message_code=DataCheckMessageCode.IS_MULTICOLLINEAR, + details={ + "columns": [ + ("col_1", "col_4"), + ("col_3", "col_5"), + ("col_3", "col_6"), + ("col_5", "col_6"), + ("col_1", "col_2"), + ("col_2", 
"col_4"), + ] + }, + ).to_dict() + ], "errors": [], - "actions": [] + "actions": [], } @@ -76,4 +115,8 @@ def test_multicollinearity_data_check_input_formats(): multi_check = MulticollinearityDataCheck(threshold=0.9) # test empty pd.DataFrame - assert multi_check.validate(pd.DataFrame()) == {"warnings": [], "errors": [], "actions": []} + assert multi_check.validate(pd.DataFrame()) == { + "warnings": [], + "errors": [], + "actions": [], + } diff --git a/evalml/tests/data_checks_tests/test_natural_language_nan_data_check.py b/evalml/tests/data_checks_tests/test_natural_language_nan_data_check.py index 89aa89698f..6ed3880669 100644 --- a/evalml/tests/data_checks_tests/test_natural_language_nan_data_check.py +++ b/evalml/tests/data_checks_tests/test_natural_language_nan_data_check.py @@ -4,75 +4,118 @@ from evalml.data_checks import ( DataCheckError, DataCheckMessageCode, - NaturalLanguageNaNDataCheck + NaturalLanguageNaNDataCheck, ) def test_nl_nan_data_check_error(): - data = pd.DataFrame({'natural_language': [None, "string_that_is_long_enough_for_natural_language", "string_that_is_long_enough_for_natural_language"]}) + data = pd.DataFrame( + { + "natural_language": [ + None, + "string_that_is_long_enough_for_natural_language", + "string_that_is_long_enough_for_natural_language", + ] + } + ) nl_nan_check = NaturalLanguageNaNDataCheck() assert nl_nan_check.validate(data) == { "warnings": [], "actions": [], - "errors": [DataCheckError(message='Input natural language column(s) (natural_language) contains NaN values. Please impute NaN values or drop these rows or columns.', - data_check_name=NaturalLanguageNaNDataCheck.name, - message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN, - details={"columns": 'natural_language'}).to_dict()] + "errors": [ + DataCheckError( + message="Input natural language column(s) (natural_language) contains NaN values. 
Please impute NaN values or drop these rows or columns.", + data_check_name=NaturalLanguageNaNDataCheck.name, + message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN, + details={"columns": "natural_language"}, + ).to_dict() + ], } def test_nl_nan_data_check_error_no_nan(): nl_nan_check = NaturalLanguageNaNDataCheck() - assert nl_nan_check.validate(pd.DataFrame({'natural_language': ["string_that_is_long_enough_for_natural_language", "string_that_is_long_enough_for_natural_language"]})) == { - "warnings": [], - "actions": [], - "errors": [] - } + assert ( + nl_nan_check.validate( + pd.DataFrame( + { + "natural_language": [ + "string_that_is_long_enough_for_natural_language", + "string_that_is_long_enough_for_natural_language", + ] + } + ) + ) + == {"warnings": [], "actions": [], "errors": []} + ) def test_nl_nan_data_check_error_other_cols_with_nan(): data = pd.DataFrame(np.random.randint(0, 10, size=(2, 2))) - data['A'] = ['string_that_is_long_enough_for_natural_language', 'string_that_is_long_enough_for_natural_language'] + data["A"] = [ + "string_that_is_long_enough_for_natural_language", + "string_that_is_long_enough_for_natural_language", + ] data = data.replace(data.iloc[0][0], None) data = data.replace(data.iloc[1][1], None) nl_nan_check = NaturalLanguageNaNDataCheck() - assert nl_nan_check.validate(data) == { - "warnings": [], - "actions": [], - "errors": [] - } + assert nl_nan_check.validate(data) == {"warnings": [], "actions": [], "errors": []} def test_nl_nan_data_check_error_multiple_nl_no_nan(): data = pd.DataFrame() - data['A'] = ['string_that_is_long_enough_for_natural_language', 'string_that_is_long_enough_for_natural_language'] - data['B'] = ['string_that_is_long_enough_for_natural_language', 'string_that_is_long_enough_for_natural_language'] + data["A"] = [ + "string_that_is_long_enough_for_natural_language", + "string_that_is_long_enough_for_natural_language", + ] + data["B"] = [ + "string_that_is_long_enough_for_natural_language", + "string_that_is_long_enough_for_natural_language", + ] - data['C'] = np.random.randint(0, 3, size=len(data)) + data["C"] = np.random.randint(0, 3, size=len(data)) nl_nan_check = NaturalLanguageNaNDataCheck() - assert nl_nan_check.validate(data) == { - "warnings": [], - "actions": [], - "errors": [] - } + assert nl_nan_check.validate(data) == {"warnings": [], "actions": [], "errors": []} def test_nl_nan_data_check_error_multiple_nl_nan(): data = pd.DataFrame() - data['A'] = pd.Series([None, "string_that_is_long_enough_for_natural_language", "string_that_is_long_enough_for_natural_language"]) - data['B'] = pd.Series([None, "string_that_is_long_enough_for_natural_language", "string_that_is_long_enough_for_natural_language"]) - data['C'] = pd.Series(["", "string_that_is_long_enough_for_natural_language", "string_that_is_long_enough_for_natural_language"]) - data['D'] = np.random.randint(0, 5, size=len(data)) + data["A"] = pd.Series( + [ + None, + "string_that_is_long_enough_for_natural_language", + "string_that_is_long_enough_for_natural_language", + ] + ) + data["B"] = pd.Series( + [ + None, + "string_that_is_long_enough_for_natural_language", + "string_that_is_long_enough_for_natural_language", + ] + ) + data["C"] = pd.Series( + [ + "", + "string_that_is_long_enough_for_natural_language", + "string_that_is_long_enough_for_natural_language", + ] + ) + data["D"] = np.random.randint(0, 5, size=len(data)) nl_nan_check = NaturalLanguageNaNDataCheck() assert nl_nan_check.validate(data) == { "warnings": [], "actions": [], - "errors": 
[DataCheckError(message='Input natural language column(s) (A, B) contains NaN values. Please impute NaN values or drop these rows or columns.', - data_check_name=NaturalLanguageNaNDataCheck.name, - message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN, - details={"columns": 'A, B'}).to_dict()] + "errors": [ + DataCheckError( + message="Input natural language column(s) (A, B) contains NaN values. Please impute NaN values or drop these rows or columns.", + data_check_name=NaturalLanguageNaNDataCheck.name, + message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN, + details={"columns": "A, B"}, + ).to_dict() + ], } @@ -80,35 +123,55 @@ def test_nl_nan_check_input_formats(): nl_nan_check = NaturalLanguageNaNDataCheck() # test empty pd.DataFrame - assert nl_nan_check.validate(pd.DataFrame()) == {"warnings": [], "errors": [], "actions": []} + assert nl_nan_check.validate(pd.DataFrame()) == { + "warnings": [], + "errors": [], + "actions": [], + } expected = { "warnings": [], "actions": [], - "errors": [DataCheckError(message='Input natural language column(s) (nl) contains NaN values. Please impute NaN values or drop these rows or columns.', - data_check_name=NaturalLanguageNaNDataCheck.name, - message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN, - details={"columns": 'nl'}).to_dict()] + "errors": [ + DataCheckError( + message="Input natural language column(s) (nl) contains NaN values. Please impute NaN values or drop these rows or columns.", + data_check_name=NaturalLanguageNaNDataCheck.name, + message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN, + details={"columns": "nl"}, + ).to_dict() + ], } - nl_col = [None, "string_that_is_long_enough_for_natural_language", "string_that_is_long_enough_for_natural_language"] + nl_col = [ + None, + "string_that_is_long_enough_for_natural_language", + "string_that_is_long_enough_for_natural_language", + ] # test Woodwork - ww_input = pd.DataFrame(nl_col, columns=['nl']) - ww_input.ww.init(logical_types={'nl': 'NaturalLanguage'}) + ww_input = pd.DataFrame(nl_col, columns=["nl"]) + ww_input.ww.init(logical_types={"nl": "NaturalLanguage"}) assert nl_nan_check.validate(ww_input) == expected expected = { "warnings": [], "actions": [], - "errors": [DataCheckError(message='Input natural language column(s) (0) contains NaN values. Please impute NaN values or drop these rows or columns.', - data_check_name=NaturalLanguageNaNDataCheck.name, - message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN, - details={'columns': '0'}).to_dict()] + "errors": [ + DataCheckError( + message="Input natural language column(s) (0) contains NaN values. 
Please impute NaN values or drop these rows or columns.", + data_check_name=NaturalLanguageNaNDataCheck.name, + message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN, + details={"columns": "0"}, + ).to_dict() + ], } # test 2D list - nl_col_without_nan = ["string_that_is_long_enough_for_natural_language", "string_that_is_long_enough_for_natural_language", "string_that_is_long_enough_for_natural_language"] + nl_col_without_nan = [ + "string_that_is_long_enough_for_natural_language", + "string_that_is_long_enough_for_natural_language", + "string_that_is_long_enough_for_natural_language", + ] assert nl_nan_check.validate([nl_col, nl_col_without_nan]) == expected # test np.array diff --git a/evalml/tests/data_checks_tests/test_no_variance_data_check.py b/evalml/tests/data_checks_tests/test_no_variance_data_check.py index a64201cbcb..05f0733319 100644 --- a/evalml/tests/data_checks_tests/test_no_variance_data_check.py +++ b/evalml/tests/data_checks_tests/test_no_variance_data_check.py @@ -8,16 +8,16 @@ DataCheckError, DataCheckMessageCode, DataCheckWarning, - NoVarianceDataCheck + NoVarianceDataCheck, ) no_variance_data_check_name = NoVarianceDataCheck.name all_distinct_X = pd.DataFrame({"feature": [1, 2, 3, 4]}) -all_null_X = pd.DataFrame({"feature": [None] * 4, - "feature_2": list(range(4))}) -two_distinct_with_nulls_X = pd.DataFrame({"feature": [1, 1, None, None], - "feature_2": list(range(4))}) +all_null_X = pd.DataFrame({"feature": [None] * 4, "feature_2": list(range(4))}) +two_distinct_with_nulls_X = pd.DataFrame( + {"feature": [1, 1, None, None], "feature_2": list(range(4))} +) two_distinct_with_nulls_X_ww = two_distinct_with_nulls_X.copy() two_distinct_with_nulls_X_ww.ww.init() @@ -29,71 +29,188 @@ all_null_y_with_name = pd.Series([None] * 4) all_null_y_with_name.name = "Labels" -feature_0_unique = DataCheckError(message="feature has 0 unique value.", - data_check_name=no_variance_data_check_name, - message_code=DataCheckMessageCode.NO_VARIANCE, - details={"column": "feature"}).to_dict() -feature_1_unique = DataCheckError(message="feature has 1 unique value.", - data_check_name=no_variance_data_check_name, - message_code=DataCheckMessageCode.NO_VARIANCE, - details={"column": "feature"}).to_dict() -labels_0_unique = DataCheckError(message="Y has 0 unique value.", - data_check_name=no_variance_data_check_name, - message_code=DataCheckMessageCode.NO_VARIANCE, - details={"column": "Y"}).to_dict() -labels_1_unique = DataCheckError(message="Y has 1 unique value.", - data_check_name=no_variance_data_check_name, - message_code=DataCheckMessageCode.NO_VARIANCE, - details={"column": "Y"}).to_dict() -drop_feature_action = DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": "feature"}).to_dict() - -cases = [(all_distinct_X, all_distinct_y, True, {"warnings": [], "errors": [], "actions": []}), - ([[1], [2], [3], [4]], [1, 2, 3, 2], False, {"warnings": [], "errors": [], "actions": []}), - (np.arange(12).reshape(4, 3), [1, 2, 3], True, {"warnings": [], "errors": [], "actions": []}), - (all_null_X, all_distinct_y, False, {"warnings": [], "errors": [feature_0_unique], "actions": [drop_feature_action]}), - (all_null_X, [1] * 4, False, {"warnings": [], "errors": [feature_0_unique, labels_1_unique], "actions": [drop_feature_action]}), - (all_null_X, all_distinct_y, True, {"warnings": [], "errors": [feature_1_unique], "actions": [drop_feature_action]}), - (all_distinct_X, all_null_y, True, {"warnings": [], "errors": [labels_1_unique], "actions": []}), - (all_distinct_X, all_null_y, False, 
{"warnings": [], "errors": [labels_0_unique], "actions": []}), - (two_distinct_with_nulls_X, two_distinct_with_nulls_y, True, - {"warnings": [DataCheckWarning(message="feature has two unique values including nulls. Consider encoding the nulls for " - "this column to be useful for machine learning.", - data_check_name=no_variance_data_check_name, - message_code=DataCheckMessageCode.NO_VARIANCE_WITH_NULL, - details={"column": "feature"}).to_dict(), - DataCheckWarning(message="Y has two unique values including nulls. Consider encoding the nulls for " - "this column to be useful for machine learning.", - data_check_name=no_variance_data_check_name, - message_code=DataCheckMessageCode.NO_VARIANCE_WITH_NULL, - details={"column": "Y"}).to_dict()], - "errors": [], - "actions": [drop_feature_action]}), - (two_distinct_with_nulls_X, two_distinct_with_nulls_y, False, {"warnings": [], "errors": [feature_1_unique, labels_1_unique], "actions": [drop_feature_action]}), - (all_distinct_X, all_null_y_with_name, False, {"warnings": [], - "errors": [DataCheckError(message="Labels has 0 unique value.", - data_check_name=no_variance_data_check_name, - message_code=DataCheckMessageCode.NO_VARIANCE, - details={"column": "Labels"}).to_dict()], - "actions": []}), - (two_distinct_with_nulls_X_ww, two_distinct_with_nulls_y_ww, True, - {"warnings": [DataCheckWarning(message="feature has two unique values including nulls. Consider encoding the nulls for " - "this column to be useful for machine learning.", - data_check_name=no_variance_data_check_name, - message_code=DataCheckMessageCode.NO_VARIANCE_WITH_NULL, - details={"column": "feature"}).to_dict(), - DataCheckWarning(message="Y has two unique values including nulls. Consider encoding the nulls for " - "this column to be useful for machine learning.", - data_check_name=no_variance_data_check_name, - message_code=DataCheckMessageCode.NO_VARIANCE_WITH_NULL, - details={"column": "Y"}).to_dict()], - "errors": [], - "actions": [drop_feature_action]}), - (two_distinct_with_nulls_X, two_distinct_with_nulls_y, False, {"warnings": [], "errors": [feature_1_unique, labels_1_unique], "actions": [drop_feature_action]}), +feature_0_unique = DataCheckError( + message="feature has 0 unique value.", + data_check_name=no_variance_data_check_name, + message_code=DataCheckMessageCode.NO_VARIANCE, + details={"column": "feature"}, +).to_dict() +feature_1_unique = DataCheckError( + message="feature has 1 unique value.", + data_check_name=no_variance_data_check_name, + message_code=DataCheckMessageCode.NO_VARIANCE, + details={"column": "feature"}, +).to_dict() +labels_0_unique = DataCheckError( + message="Y has 0 unique value.", + data_check_name=no_variance_data_check_name, + message_code=DataCheckMessageCode.NO_VARIANCE, + details={"column": "Y"}, +).to_dict() +labels_1_unique = DataCheckError( + message="Y has 1 unique value.", + data_check_name=no_variance_data_check_name, + message_code=DataCheckMessageCode.NO_VARIANCE, + details={"column": "Y"}, +).to_dict() +drop_feature_action = DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "feature"} +).to_dict() - ] +cases = [ + ( + all_distinct_X, + all_distinct_y, + True, + {"warnings": [], "errors": [], "actions": []}, + ), + ( + [[1], [2], [3], [4]], + [1, 2, 3, 2], + False, + {"warnings": [], "errors": [], "actions": []}, + ), + ( + np.arange(12).reshape(4, 3), + [1, 2, 3], + True, + {"warnings": [], "errors": [], "actions": []}, + ), + ( + all_null_X, + all_distinct_y, + False, + { + "warnings": [], + "errors": 
[feature_0_unique], + "actions": [drop_feature_action], + }, + ), + ( + all_null_X, + [1] * 4, + False, + { + "warnings": [], + "errors": [feature_0_unique, labels_1_unique], + "actions": [drop_feature_action], + }, + ), + ( + all_null_X, + all_distinct_y, + True, + { + "warnings": [], + "errors": [feature_1_unique], + "actions": [drop_feature_action], + }, + ), + ( + all_distinct_X, + all_null_y, + True, + {"warnings": [], "errors": [labels_1_unique], "actions": []}, + ), + ( + all_distinct_X, + all_null_y, + False, + {"warnings": [], "errors": [labels_0_unique], "actions": []}, + ), + ( + two_distinct_with_nulls_X, + two_distinct_with_nulls_y, + True, + { + "warnings": [ + DataCheckWarning( + message="feature has two unique values including nulls. Consider encoding the nulls for " + "this column to be useful for machine learning.", + data_check_name=no_variance_data_check_name, + message_code=DataCheckMessageCode.NO_VARIANCE_WITH_NULL, + details={"column": "feature"}, + ).to_dict(), + DataCheckWarning( + message="Y has two unique values including nulls. Consider encoding the nulls for " + "this column to be useful for machine learning.", + data_check_name=no_variance_data_check_name, + message_code=DataCheckMessageCode.NO_VARIANCE_WITH_NULL, + details={"column": "Y"}, + ).to_dict(), + ], + "errors": [], + "actions": [drop_feature_action], + }, + ), + ( + two_distinct_with_nulls_X, + two_distinct_with_nulls_y, + False, + { + "warnings": [], + "errors": [feature_1_unique, labels_1_unique], + "actions": [drop_feature_action], + }, + ), + ( + all_distinct_X, + all_null_y_with_name, + False, + { + "warnings": [], + "errors": [ + DataCheckError( + message="Labels has 0 unique value.", + data_check_name=no_variance_data_check_name, + message_code=DataCheckMessageCode.NO_VARIANCE, + details={"column": "Labels"}, + ).to_dict() + ], + "actions": [], + }, + ), + ( + two_distinct_with_nulls_X_ww, + two_distinct_with_nulls_y_ww, + True, + { + "warnings": [ + DataCheckWarning( + message="feature has two unique values including nulls. Consider encoding the nulls for " + "this column to be useful for machine learning.", + data_check_name=no_variance_data_check_name, + message_code=DataCheckMessageCode.NO_VARIANCE_WITH_NULL, + details={"column": "feature"}, + ).to_dict(), + DataCheckWarning( + message="Y has two unique values including nulls. 
Consider encoding the nulls for " + "this column to be useful for machine learning.", + data_check_name=no_variance_data_check_name, + message_code=DataCheckMessageCode.NO_VARIANCE_WITH_NULL, + details={"column": "Y"}, + ).to_dict(), + ], + "errors": [], + "actions": [drop_feature_action], + }, + ), + ( + two_distinct_with_nulls_X, + two_distinct_with_nulls_y, + False, + { + "warnings": [], + "errors": [feature_1_unique, labels_1_unique], + "actions": [drop_feature_action], + }, + ), +] @pytest.mark.parametrize("X, y, count_nan_as_value, expected_validation_result", cases) -def test_no_variance_data_check_warnings(X, y, count_nan_as_value, expected_validation_result): +def test_no_variance_data_check_warnings( + X, y, count_nan_as_value, expected_validation_result +): check = NoVarianceDataCheck(count_nan_as_value) assert check.validate(X, y) == expected_validation_result diff --git a/evalml/tests/data_checks_tests/test_outliers_data_check.py b/evalml/tests/data_checks_tests/test_outliers_data_check.py index 53193ef5f4..a40ec4acc5 100644 --- a/evalml/tests/data_checks_tests/test_outliers_data_check.py +++ b/evalml/tests/data_checks_tests/test_outliers_data_check.py @@ -3,11 +3,7 @@ import numpy as np import pandas as pd -from evalml.data_checks import ( - DataCheckMessageCode, - DataCheckWarning, - OutliersDataCheck -) +from evalml.data_checks import DataCheckMessageCode, DataCheckWarning, OutliersDataCheck outliers_data_check_name = OutliersDataCheck.name @@ -21,16 +17,20 @@ def test_outliers_data_check_warnings(): X.iloc[3, 25] = 1000 X.iloc[5, 55] = 10000 X.iloc[10, 72] = -1000 - X.iloc[:, 90] = 'string_values' + X.iloc[:, 90] = "string_values" outliers_check = OutliersDataCheck() assert outliers_check.validate(X) == { - "warnings": [DataCheckWarning(message="Column(s) '3', '25', '55', '72' are likely to have outlier data.", - data_check_name=outliers_data_check_name, - message_code=DataCheckMessageCode.HAS_OUTLIERS, - details={"columns": [3, 25, 55, 72]}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column(s) '3', '25', '55', '72' are likely to have outlier data.", + data_check_name=outliers_data_check_name, + message_code=DataCheckMessageCode.HAS_OUTLIERS, + details={"columns": [3, 25, 55, 72]}, + ).to_dict() + ], "errors": [], - "actions": [] + "actions": [], } @@ -38,7 +38,11 @@ def test_outliers_data_check_input_formats(): outliers_check = OutliersDataCheck() # test empty pd.DataFrame - assert outliers_check.validate(pd.DataFrame()) == {"warnings": [], "errors": [], "actions": []} + assert outliers_check.validate(pd.DataFrame()) == { + "warnings": [], + "errors": [], + "actions": [], + } # test np.array a = np.arange(10) * 0.01 @@ -52,24 +56,32 @@ def test_outliers_data_check_input_formats(): outliers_check = OutliersDataCheck() assert outliers_check.validate(X.to_numpy()) == { - "warnings": [DataCheckWarning(message="Column(s) '3', '25', '55', '72' are likely to have outlier data.", - data_check_name=outliers_data_check_name, - message_code=DataCheckMessageCode.HAS_OUTLIERS, - details={"columns": [3, 25, 55, 72]}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column(s) '3', '25', '55', '72' are likely to have outlier data.", + data_check_name=outliers_data_check_name, + message_code=DataCheckMessageCode.HAS_OUTLIERS, + details={"columns": [3, 25, 55, 72]}, + ).to_dict() + ], "errors": [], - "actions": [] + "actions": [], } # test Woodwork outliers_check = OutliersDataCheck() X.ww.init() assert outliers_check.validate(X) == { - "warnings": 
[DataCheckWarning(message="Column(s) '3', '25', '55', '72' are likely to have outlier data.", - data_check_name=outliers_data_check_name, - message_code=DataCheckMessageCode.HAS_OUTLIERS, - details={"columns": [3, 25, 55, 72]}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column(s) '3', '25', '55', '72' are likely to have outlier data.", + data_check_name=outliers_data_check_name, + message_code=DataCheckMessageCode.HAS_OUTLIERS, + details={"columns": [3, 25, 55, 72]}, + ).to_dict() + ], "errors": [], - "actions": [] + "actions": [], } @@ -78,17 +90,23 @@ def test_outliers_data_check_string_cols(): data = np.tile(a, (100, 2)) n_cols = 20 - X = pd.DataFrame(data=data, columns=[string.ascii_lowercase[i] for i in range(n_cols)]) + X = pd.DataFrame( + data=data, columns=[string.ascii_lowercase[i] for i in range(n_cols)] + ) X.iloc[0, 3] = 1000 outliers_check = OutliersDataCheck() assert outliers_check.validate(X) == { - "warnings": [DataCheckWarning(message="Column(s) 'd' are likely to have outlier data.", - data_check_name=outliers_data_check_name, - message_code=DataCheckMessageCode.HAS_OUTLIERS, - details={"columns": ["d"]}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column(s) 'd' are likely to have outlier data.", + data_check_name=outliers_data_check_name, + message_code=DataCheckMessageCode.HAS_OUTLIERS, + details={"columns": ["d"]}, + ).to_dict() + ], "errors": [], - "actions": [] + "actions": [], } @@ -105,22 +123,32 @@ def test_outlier_score(): for col in X.columns: results = OutliersDataCheck._outlier_score(X[col], convert_column=False) if col in [3, 25, 55, 72]: - assert results['score'] != 1.0 - assert len(results['values']['high_values']) != 0 or len(results['values']['low_values']) != 0 + assert results["score"] != 1.0 + assert ( + len(results["values"]["high_values"]) != 0 + or len(results["values"]["low_values"]) != 0 + ) else: - assert results['score'] == 1.0 - assert len(results['values']['high_values']) == 0 and len(results['values']['low_values']) == 0 + assert results["score"] == 1.0 + assert ( + len(results["values"]["high_values"]) == 0 + and len(results["values"]["low_values"]) == 0 + ) def test_outlier_score_convert_column_to_int(): has_outlier = pd.Series(np.append(np.arange(10), 1000)).astype(object) results = OutliersDataCheck._outlier_score(has_outlier, convert_column=True) - assert results['score'] != 1.0 - len(results['values']['high_values']) != 0 or len(results['values']['low_values']) != 0 + assert results["score"] != 1.0 + len(results["values"]["high_values"]) != 0 or len( + results["values"]["low_values"] + ) != 0 no_outlier = pd.Series(np.arange(10)).astype(object) results = OutliersDataCheck._outlier_score(no_outlier, convert_column=True) - assert results['score'] == 1.0 - len(results['values']['high_values']) == 0 and len(results['values']['low_values']) == 0 + assert results["score"] == 1.0 + len(results["values"]["high_values"]) == 0 and len( + results["values"]["low_values"] + ) == 0 def test_outlier_score_all_nan(): diff --git a/evalml/tests/data_checks_tests/test_sparsity_data_check.py b/evalml/tests/data_checks_tests/test_sparsity_data_check.py index a41c2cf826..89d48af3a8 100644 --- a/evalml/tests/data_checks_tests/test_sparsity_data_check.py +++ b/evalml/tests/data_checks_tests/test_sparsity_data_check.py @@ -6,7 +6,7 @@ DataCheckActionCode, DataCheckMessageCode, DataCheckWarning, - SparsityDataCheck + SparsityDataCheck, ) sparsity_data_check_name = SparsityDataCheck.name @@ -20,27 +20,45 @@ def 
test_sparsity_data_check_init(): sparsity_check = SparsityDataCheck("multiclass", threshold=0.2) assert sparsity_check.unique_count_threshold == 10 - sparsity_check = SparsityDataCheck("multiclass", threshold=.1, unique_count_threshold=5) + sparsity_check = SparsityDataCheck( + "multiclass", threshold=0.1, unique_count_threshold=5 + ) assert sparsity_check.unique_count_threshold == 5 - with pytest.raises(ValueError, match="Threshold must be a float between 0 and 1, inclusive."): + with pytest.raises( + ValueError, match="Threshold must be a float between 0 and 1, inclusive." + ): SparsityDataCheck("multiclass", threshold=-0.1) - with pytest.raises(ValueError, match="Threshold must be a float between 0 and 1, inclusive."): + with pytest.raises( + ValueError, match="Threshold must be a float between 0 and 1, inclusive." + ): SparsityDataCheck("multiclass", threshold=1.1) - with pytest.raises(ValueError, match="Sparsity is only defined for multiclass problem types."): - SparsityDataCheck("binary", threshold=.5) - with pytest.raises(ValueError, match="Sparsity is only defined for multiclass problem types."): - SparsityDataCheck("time series binary", threshold=.5) - with pytest.raises(ValueError, match="Sparsity is only defined for multiclass problem types."): - SparsityDataCheck("regression", threshold=.5) - with pytest.raises(ValueError, match="Sparsity is only defined for multiclass problem types."): - SparsityDataCheck("time series regression", threshold=.5) - - with pytest.raises(ValueError, match="Unique count threshold must be positive integer."): - SparsityDataCheck("multiclass", threshold=.5, unique_count_threshold=-1) - with pytest.raises(ValueError, match="Unique count threshold must be positive integer."): - SparsityDataCheck("multiclass", threshold=.5, unique_count_threshold=2.3) + with pytest.raises( + ValueError, match="Sparsity is only defined for multiclass problem types." + ): + SparsityDataCheck("binary", threshold=0.5) + with pytest.raises( + ValueError, match="Sparsity is only defined for multiclass problem types." + ): + SparsityDataCheck("time series binary", threshold=0.5) + with pytest.raises( + ValueError, match="Sparsity is only defined for multiclass problem types." + ): + SparsityDataCheck("regression", threshold=0.5) + with pytest.raises( + ValueError, match="Sparsity is only defined for multiclass problem types." + ): + SparsityDataCheck("time series regression", threshold=0.5) + + with pytest.raises( + ValueError, match="Unique count threshold must be positive integer." + ): + SparsityDataCheck("multiclass", threshold=0.5, unique_count_threshold=-1) + with pytest.raises( + ValueError, match="Unique count threshold must be positive integer." 
+ ): + SparsityDataCheck("multiclass", threshold=0.5, unique_count_threshold=2.3) def test_sparsity_data_check_sparsity_score(): @@ -63,53 +81,75 @@ def test_sparsity_data_check_sparsity_score(): assert scores == 0 # Application to an entire DataFrame - data = pd.DataFrame({'most_sparse': [float(x) for x in range(10)], # [0,1,2,3,4,5,6,7,8,9] - 'more_sparse': [x % 5 for x in range(10)], # [0,1,2,3,4,0,1,2,3,4] - 'sparse': [x % 3 for x in range(10)], # [0,1,2,0,1,2,0,1,2,0] - 'less_sparse': [x % 2 for x in range(10)], # [0,1,0,1,0,1,0,1,0,1] - 'not_sparse': [float(1) for x in range(10)]}) # [1,1,1,1,1,1,1,1,1,1] + data = pd.DataFrame( + { + "most_sparse": [float(x) for x in range(10)], # [0,1,2,3,4,5,6,7,8,9] + "more_sparse": [x % 5 for x in range(10)], # [0,1,2,3,4,0,1,2,3,4] + "sparse": [x % 3 for x in range(10)], # [0,1,2,0,1,2,0,1,2,0] + "less_sparse": [x % 2 for x in range(10)], # [0,1,0,1,0,1,0,1,0,1] + "not_sparse": [float(1) for x in range(10)], + } + ) # [1,1,1,1,1,1,1,1,1,1] sparsity_score = SparsityDataCheck.sparsity_score scores = data.apply(sparsity_score, count_threshold=3) - ans = pd.Series({'most_sparse': 0.000000, - 'more_sparse': 0.000000, - 'sparse': 0.333333, - 'less_sparse': 1.000000, - 'not_sparse': 1.000000}) + ans = pd.Series( + { + "most_sparse": 0.000000, + "more_sparse": 0.000000, + "sparse": 0.333333, + "less_sparse": 1.000000, + "not_sparse": 1.000000, + } + ) assert scores.round(6).equals(ans), "Sparsity DataFrame check failed." def test_sparsity_data_check_warnings(): - data = pd.DataFrame({'most_sparse': [float(x) for x in range(10)], # [0,1,2,3,4,5,6,7,8,9] - 'more_sparse': [x % 5 for x in range(10)], # [0,1,2,3,4,0,1,2,3,4] - 'sparse': [x % 3 for x in range(10)], # [0,1,2,0,1,2,0,1,2,0] - 'less_sparse': [x % 2 for x in range(10)], # [0,1,0,1,0,1,0,1,0,1] - 'not_sparse': [float(1) for x in range(10)]}) # [1,1,1,1,1,1,1,1,1,1] - - sparsity_check = SparsityDataCheck(problem_type="multiclass", - threshold=.4, - unique_count_threshold=3) + data = pd.DataFrame( + { + "most_sparse": [float(x) for x in range(10)], # [0,1,2,3,4,5,6,7,8,9] + "more_sparse": [x % 5 for x in range(10)], # [0,1,2,3,4,0,1,2,3,4] + "sparse": [x % 3 for x in range(10)], # [0,1,2,0,1,2,0,1,2,0] + "less_sparse": [x % 2 for x in range(10)], # [0,1,0,1,0,1,0,1,0,1] + "not_sparse": [float(1) for x in range(10)], + } + ) # [1,1,1,1,1,1,1,1,1,1] + + sparsity_check = SparsityDataCheck( + problem_type="multiclass", threshold=0.4, unique_count_threshold=3 + ) assert sparsity_check.validate(data) == { - "warnings": [DataCheckWarning( - message="Input columns (most_sparse) for multiclass problem type are too sparse.", - data_check_name=sparsity_data_check_name, - message_code=DataCheckMessageCode.TOO_SPARSE, - details={"column": "most_sparse", - 'sparsity_score': 0}).to_dict(), - DataCheckWarning( - message="Input columns (more_sparse) for multiclass problem type are too sparse.", - data_check_name=sparsity_data_check_name, - message_code=DataCheckMessageCode.TOO_SPARSE, - details={"column": "more_sparse", - 'sparsity_score': 0}).to_dict(), - DataCheckWarning( - message="Input columns (sparse) for multiclass problem type are too sparse.", - data_check_name=sparsity_data_check_name, - message_code=DataCheckMessageCode.TOO_SPARSE, - details={"column": "sparse", - 'sparsity_score': 0.3333333333333333}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Input columns (most_sparse) for multiclass problem type are too sparse.", + data_check_name=sparsity_data_check_name, + 
message_code=DataCheckMessageCode.TOO_SPARSE, + details={"column": "most_sparse", "sparsity_score": 0}, + ).to_dict(), + DataCheckWarning( + message="Input columns (more_sparse) for multiclass problem type are too sparse.", + data_check_name=sparsity_data_check_name, + message_code=DataCheckMessageCode.TOO_SPARSE, + details={"column": "more_sparse", "sparsity_score": 0}, + ).to_dict(), + DataCheckWarning( + message="Input columns (sparse) for multiclass problem type are too sparse.", + data_check_name=sparsity_data_check_name, + message_code=DataCheckMessageCode.TOO_SPARSE, + details={"column": "sparse", "sparsity_score": 0.3333333333333333}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'most_sparse'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'more_sparse'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'sparse'}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "most_sparse"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "more_sparse"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "sparse"} + ).to_dict(), + ], } diff --git a/evalml/tests/data_checks_tests/test_target_leakage_data_check.py b/evalml/tests/data_checks_tests/test_target_leakage_data_check.py index 6f70c2f406..691e171dde 100644 --- a/evalml/tests/data_checks_tests/test_target_leakage_data_check.py +++ b/evalml/tests/data_checks_tests/test_target_leakage_data_check.py @@ -9,7 +9,7 @@ DataCheckActionCode, DataCheckMessageCode, DataCheckWarning, - TargetLeakageDataCheck + TargetLeakageDataCheck, ) target_leakage_data_check_name = TargetLeakageDataCheck.name @@ -28,15 +28,21 @@ def test_target_leakage_data_check_init(): target_leakage_check = TargetLeakageDataCheck(pct_corr_threshold=1.0) assert target_leakage_check.pct_corr_threshold == 1.0 - with pytest.raises(ValueError, match="pct_corr_threshold must be a float between 0 and 1, inclusive."): + with pytest.raises( + ValueError, + match="pct_corr_threshold must be a float between 0 and 1, inclusive.", + ): TargetLeakageDataCheck(pct_corr_threshold=-0.1) - with pytest.raises(ValueError, match="pct_corr_threshold must be a float between 0 and 1, inclusive."): + with pytest.raises( + ValueError, + match="pct_corr_threshold must be a float between 0 and 1, inclusive.", + ): TargetLeakageDataCheck(pct_corr_threshold=1.1) with pytest.raises(ValueError, match="Method 'MUTUAL' not in"): - TargetLeakageDataCheck(method='MUTUAL') + TargetLeakageDataCheck(method="MUTUAL") with pytest.raises(ValueError, match="Method 'person' not in"): - TargetLeakageDataCheck(method='person') + TargetLeakageDataCheck(method="person") def test_target_leakage_data_check_warnings(): @@ -51,35 +57,55 @@ def test_target_leakage_data_check_warnings(): leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.5) assert leakage_check.validate(X, y) == { - "warnings": [DataCheckWarning(message="Column 'a' is 50.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "a"}).to_dict(), - DataCheckWarning(message="Column 'b' is 50.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "b"}).to_dict(), - DataCheckWarning(message="Column 'c' is 
50.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "c"}).to_dict(), - DataCheckWarning(message="Column 'd' is 50.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "d"}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column 'a' is 50.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "a"}, + ).to_dict(), + DataCheckWarning( + message="Column 'b' is 50.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "b"}, + ).to_dict(), + DataCheckWarning( + message="Column 'c' is 50.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "c"}, + ).to_dict(), + DataCheckWarning( + message="Column 'd' is 50.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "d"}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'a'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'b'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'c'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'd'}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "a"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "b"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "c"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "d"} + ).to_dict(), + ], } -@pytest.mark.parametrize("data_type", ['np', 'pd', 'ww']) +@pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) def test_target_leakage_data_check_empty(data_type, make_data_type): X = make_data_type(data_type, pd.DataFrame()) y = make_data_type(data_type, pd.Series()) - leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8, method='mutual') + leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8, method="mutual") assert leakage_check.validate(X, y) == {"warnings": [], "errors": [], "actions": []} @@ -95,27 +121,47 @@ def test_target_leakage_data_check_input_formats(): y = y.astype(bool) expected = { - "warnings": [DataCheckWarning(message="Column 'a' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "a"}).to_dict(), - DataCheckWarning(message="Column 'b' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "b"}).to_dict(), - DataCheckWarning(message="Column 'c' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "c"}).to_dict(), - DataCheckWarning(message="Column 'd' is 80.0% or more correlated with the target", - 
data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "d"}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column 'a' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "a"}, + ).to_dict(), + DataCheckWarning( + message="Column 'b' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "b"}, + ).to_dict(), + DataCheckWarning( + message="Column 'c' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "c"}, + ).to_dict(), + DataCheckWarning( + message="Column 'd' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "d"}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'a'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'b'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'c'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'd'}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "a"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "b"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "c"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "d"} + ).to_dict(), + ], } # test X, y with ww X_ww = X.copy() @@ -128,27 +174,47 @@ def test_target_leakage_data_check_input_formats(): # test X as np.array assert leakage_check.validate(X.to_numpy().astype(float), y) == { - "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": 0}).to_dict(), - DataCheckWarning(message="Column '1' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": 1}).to_dict(), - DataCheckWarning(message="Column '2' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": 2}).to_dict(), - DataCheckWarning(message="Column '3' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": 3}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column '0' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": 0}, + ).to_dict(), + DataCheckWarning( + message="Column '1' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": 1}, + ).to_dict(), + DataCheckWarning( + message="Column '2' is 80.0% or more correlated with the target", + 
data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": 2}, + ).to_dict(), + DataCheckWarning( + message="Column '3' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": 3}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 0}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 1}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 2}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 3}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": 0} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": 1} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": 2} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": 3} + ).to_dict(), + ], } @@ -160,11 +226,7 @@ def test_target_leakage_none(): X["b"] = [0, 0, 0, 0] y = y.astype(bool) - expected = { - "warnings": [], - "errors": [], - "actions": [] - } + expected = {"warnings": [], "errors": [], "actions": []} assert leakage_check.validate(X, y) == expected @@ -176,33 +238,58 @@ def test_target_leakage_types(): X = pd.DataFrame() X["a"] = ["a", "b", "a", "a"] X["b"] = y - 1 - X["c"] = [datetime.strptime("2015", "%Y"), datetime.strptime("2016", "%Y"), datetime.strptime("2015", "%Y"), datetime.strptime("2015", "%Y")] + X["c"] = [ + datetime.strptime("2015", "%Y"), + datetime.strptime("2016", "%Y"), + datetime.strptime("2015", "%Y"), + datetime.strptime("2015", "%Y"), + ] X["d"] = ~y X["e"] = [0, 0, 0, 0] y = y.astype(bool) expected = { - "warnings": [DataCheckWarning(message="Column 'a' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "a"}).to_dict(), - DataCheckWarning(message="Column 'b' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "b"}).to_dict(), - DataCheckWarning(message="Column 'c' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "c"}).to_dict(), - DataCheckWarning(message="Column 'd' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "d"}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column 'a' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "a"}, + ).to_dict(), + DataCheckWarning( + message="Column 'b' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "b"}, + ).to_dict(), + DataCheckWarning( + message="Column 'c' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "c"}, + ).to_dict(), + DataCheckWarning( + message="Column 'd' is 80.0% or more 
correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "d"}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'a'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'b'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'c'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'd'}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "a"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "b"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "c"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "d"} + ).to_dict(), + ], } assert leakage_check.validate(X, y) == expected @@ -212,7 +299,11 @@ def test_target_leakage_multi(): leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8) # test empty pd.DataFrame, empty pd.Series - assert leakage_check.validate(pd.DataFrame(), pd.Series()) == {"warnings": [], "errors": [], "actions": []} + assert leakage_check.validate(pd.DataFrame(), pd.Series()) == { + "warnings": [], + "errors": [], + "actions": [], + } y = pd.Series([1, 0, 2, 1, 2, 0]) X = pd.DataFrame() @@ -223,22 +314,38 @@ def test_target_leakage_multi(): X["e"] = ["a", "b", "c", "a", "b", "c"] expected = { - "warnings": [DataCheckWarning(message="Column 'a' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "a"}).to_dict(), - DataCheckWarning(message="Column 'b' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "b"}).to_dict(), - DataCheckWarning(message="Column 'c' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "c"}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column 'a' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "a"}, + ).to_dict(), + DataCheckWarning( + message="Column 'b' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "b"}, + ).to_dict(), + DataCheckWarning( + message="Column 'c' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "c"}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'a'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'b'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'c'}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "a"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "b"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "c"} + ).to_dict(), + ], } # test X, y with ww @@ -255,9 +362,15 @@ 
def test_target_leakage_regression(): leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8) # test empty pd.DataFrame, empty pd.Series - assert leakage_check.validate(pd.DataFrame(), pd.Series()) == {"warnings": [], "errors": [], "actions": []} + assert leakage_check.validate(pd.DataFrame(), pd.Series()) == { + "warnings": [], + "errors": [], + "actions": [], + } - y = pd.Series([0.4, 0.1, 2.3, 4.3, 2.2, 1.8, 3.7, 3.6, 2.4, 0.9, 3.1, 2.8, 4.1, 1.6, 1.2]) + y = pd.Series( + [0.4, 0.1, 2.3, 4.3, 2.2, 1.8, 3.7, 3.6, 2.4, 0.9, 3.1, 2.8, 4.1, 1.6, 1.2] + ) X = pd.DataFrame() X["a"] = y * 3 X["b"] = y - 1 @@ -266,27 +379,47 @@ def test_target_leakage_regression(): X["e"] = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"] expected = { - "warnings": [DataCheckWarning(message="Column 'a' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "a"}).to_dict(), - DataCheckWarning(message="Column 'b' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "b"}).to_dict(), - DataCheckWarning(message="Column 'c' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "c"}).to_dict(), - DataCheckWarning(message="Column 'e' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "e"}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column 'a' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "a"}, + ).to_dict(), + DataCheckWarning( + message="Column 'b' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "b"}, + ).to_dict(), + DataCheckWarning( + message="Column 'c' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "c"}, + ).to_dict(), + DataCheckWarning( + message="Column 'e' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "e"}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'a'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'b'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'c'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'e'}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "a"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "b"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "c"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "e"} + ).to_dict(), + ], } # test X, y with ww @@ -309,45 +442,65 @@ def test_target_leakage_data_check_warnings_pearson(): X["e"] = [0, 0, 0, 0] y = y.astype(bool) - leakage_check = 
TargetLeakageDataCheck(pct_corr_threshold=0.5, method='pearson') + leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.5, method="pearson") assert leakage_check.validate(X, y) == { - "warnings": [DataCheckWarning(message="Column 'a' is 50.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "a"}).to_dict(), - DataCheckWarning(message="Column 'b' is 50.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "b"}).to_dict(), - DataCheckWarning(message="Column 'c' is 50.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "c"}).to_dict(), - DataCheckWarning(message="Column 'd' is 50.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "d"}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column 'a' is 50.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "a"}, + ).to_dict(), + DataCheckWarning( + message="Column 'b' is 50.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "b"}, + ).to_dict(), + DataCheckWarning( + message="Column 'c' is 50.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "c"}, + ).to_dict(), + DataCheckWarning( + message="Column 'd' is 50.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "d"}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'a'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'b'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'c'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'd'}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "a"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "b"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "c"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "d"} + ).to_dict(), + ], } y = ["a", "b", "a", "a"] - leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.5, method='pearson') - assert leakage_check.validate(X, y) == { - "warnings": [], - "errors": [], - "actions": [] - } + leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.5, method="pearson") + assert leakage_check.validate(X, y) == {"warnings": [], "errors": [], "actions": []} def test_target_leakage_data_check_input_formats_pearson(): - leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8, method='pearson') + leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8, method="pearson") # test empty pd.DataFrame, empty pd.Series - assert leakage_check.validate(pd.DataFrame(), pd.Series()) == {"warnings": [], "errors": [], "actions": 
[]} + assert leakage_check.validate(pd.DataFrame(), pd.Series()) == { + "warnings": [], + "errors": [], + "actions": [], + } y = pd.Series([1, 0, 1, 1]) X = pd.DataFrame() @@ -359,52 +512,92 @@ def test_target_leakage_data_check_input_formats_pearson(): y = y.astype(bool) expected = { - "warnings": [DataCheckWarning(message="Column 'a' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "a"}).to_dict(), - DataCheckWarning(message="Column 'b' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "b"}).to_dict(), - DataCheckWarning(message="Column 'c' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "c"}).to_dict(), - DataCheckWarning(message="Column 'd' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": "d"}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column 'a' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "a"}, + ).to_dict(), + DataCheckWarning( + message="Column 'b' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "b"}, + ).to_dict(), + DataCheckWarning( + message="Column 'c' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "c"}, + ).to_dict(), + DataCheckWarning( + message="Column 'd' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "d"}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'a'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'b'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'c'}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'd'}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "a"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "b"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "c"} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "d"} + ).to_dict(), + ], } # test X as np.array assert leakage_check.validate(X.values, y) == { - "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": 0}).to_dict(), - DataCheckWarning(message="Column '1' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": 1}).to_dict(), - DataCheckWarning(message="Column '2' is 80.0% or more correlated with the target", - 
data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": 2}).to_dict(), - DataCheckWarning(message="Column '3' is 80.0% or more correlated with the target", - data_check_name=target_leakage_data_check_name, - message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"column": 3}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Column '0' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": 0}, + ).to_dict(), + DataCheckWarning( + message="Column '1' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": 1}, + ).to_dict(), + DataCheckWarning( + message="Column '2' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": 2}, + ).to_dict(), + DataCheckWarning( + message="Column '3' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": 3}, + ).to_dict(), + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 0}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 1}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 2}).to_dict(), - DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 3}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": 0} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": 1} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": 2} + ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": 3} + ).to_dict(), + ], } # test X, y with ww @@ -418,17 +611,13 @@ def test_target_leakage_data_check_input_formats_pearson(): def test_target_leakage_none_pearson(): - leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8, method='pearson') + leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8, method="pearson") y = pd.Series([1, 0, 1, 1]) X = pd.DataFrame() X["a"] = [1, 1, 1, 1] X["b"] = [0, 0, 0, 0] y = y.astype(bool) - expected = { - "warnings": [], - "errors": [], - "actions": [] - } + expected = {"warnings": [], "errors": [], "actions": []} assert leakage_check.validate(X, y) == expected diff --git a/evalml/tests/data_checks_tests/test_uniqueness_data_check.py b/evalml/tests/data_checks_tests/test_uniqueness_data_check.py index dbdc84b3c2..07f0547bad 100644 --- a/evalml/tests/data_checks_tests/test_uniqueness_data_check.py +++ b/evalml/tests/data_checks_tests/test_uniqueness_data_check.py @@ -7,7 +7,7 @@ DataCheckActionCode, DataCheckMessageCode, DataCheckWarning, - UniquenessDataCheck + UniquenessDataCheck, ) uniqueness_data_check_name = UniquenessDataCheck.name @@ -26,9 +26,13 @@ def test_uniqueness_data_check_init(): uniqueness_check = UniquenessDataCheck("regression", threshold=1.0) assert uniqueness_check.threshold == 1.0 - with pytest.raises(ValueError, match="threshold must be a float between 0 and 1, inclusive."): + with pytest.raises( + ValueError, match="threshold must be a float between 0 and 1, inclusive." 
+ ): UniquenessDataCheck("regression", threshold=-0.1) - with pytest.raises(ValueError, match="threshold must be a float between 0 and 1, inclusive."): + with pytest.raises( + ValueError, match="threshold must be a float between 0 and 1, inclusive." + ): UniquenessDataCheck("regression", threshold=1.1) @@ -57,46 +61,82 @@ def test_uniqueness_data_check_uniqueness_score(): assert scores == ans # Test uniqueness in each column of a DataFrame - data = pd.DataFrame({'most_unique': [float(x) for x in range(10)], # [0,1,2,3,4,5,6,7,8,9] - 'more_unique': [x % 5 for x in range(10)], # [0,1,2,3,4,0,1,2,3,4] - 'unique': [x % 3 for x in range(10)], # [0,1,2,0,1,2,0,1,2,0] - 'less_unique': [x % 2 for x in range(10)], # [0,1,0,1,0,1,0,1,0,1] - 'not_unique': [float(1) for x in range(10)]}) # [1,1,1,1,1,1,1,1,1,1] + data = pd.DataFrame( + { + "most_unique": [float(x) for x in range(10)], # [0,1,2,3,4,5,6,7,8,9] + "more_unique": [x % 5 for x in range(10)], # [0,1,2,3,4,0,1,2,3,4] + "unique": [x % 3 for x in range(10)], # [0,1,2,0,1,2,0,1,2,0] + "less_unique": [x % 2 for x in range(10)], # [0,1,0,1,0,1,0,1,0,1] + "not_unique": [float(1) for x in range(10)], + } + ) # [1,1,1,1,1,1,1,1,1,1] scores = data.apply(uniqueness_score) - ans = pd.Series({'most_unique': 0.90, - 'more_unique': 0.80, - 'unique': 0.66, - 'less_unique': 0.50, - 'not_unique': 0.00}) + ans = pd.Series( + { + "most_unique": 0.90, + "more_unique": 0.80, + "unique": 0.66, + "less_unique": 0.50, + "not_unique": 0.00, + } + ) assert scores.round(7).equals(ans) def test_uniqueness_data_check_warnings(): - data = pd.DataFrame({'regression_unique_enough': [float(x) for x in range(100)], - 'regression_not_unique_enough': [float(1) for x in range(100)]}) + data = pd.DataFrame( + { + "regression_unique_enough": [float(x) for x in range(100)], + "regression_not_unique_enough": [float(1) for x in range(100)], + } + ) uniqueness_check = UniquenessDataCheck(problem_type="regression") assert uniqueness_check.validate(data) == { - "warnings": [DataCheckWarning( - message="Input columns (regression_not_unique_enough) for regression problem type are not unique enough.", - data_check_name=uniqueness_data_check_name, - message_code=DataCheckMessageCode.NOT_UNIQUE_ENOUGH, - details={"column": "regression_not_unique_enough", - 'uniqueness_score': 0.0}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Input columns (regression_not_unique_enough) for regression problem type are not unique enough.", + data_check_name=uniqueness_data_check_name, + message_code=DataCheckMessageCode.NOT_UNIQUE_ENOUGH, + details={ + "column": "regression_not_unique_enough", + "uniqueness_score": 0.0, + }, + ).to_dict() + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'regression_not_unique_enough'}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, + metadata={"column": "regression_not_unique_enough"}, + ).to_dict() + ], } - data = pd.DataFrame({'multiclass_too_unique': ["Cats", "Are", "Absolutely", "The", "Best"] * 20, - 'multiclass_not_too_unique': ["Cats", "Cats", "Best", "Best", "Best"] * 20}) + data = pd.DataFrame( + { + "multiclass_too_unique": ["Cats", "Are", "Absolutely", "The", "Best"] * 20, + "multiclass_not_too_unique": ["Cats", "Cats", "Best", "Best", "Best"] * 20, + } + ) uniqueness_check = UniquenessDataCheck(problem_type="multiclass") assert uniqueness_check.validate(data) == { - "warnings": [DataCheckWarning( - message="Input columns (multiclass_too_unique) for multiclass problem type 
are too unique.", - data_check_name=uniqueness_data_check_name, - message_code=DataCheckMessageCode.TOO_UNIQUE, - details={"column": "multiclass_too_unique", - 'uniqueness_score': 0.7999999999999999}).to_dict()], + "warnings": [ + DataCheckWarning( + message="Input columns (multiclass_too_unique) for multiclass problem type are too unique.", + data_check_name=uniqueness_data_check_name, + message_code=DataCheckMessageCode.TOO_UNIQUE, + details={ + "column": "multiclass_too_unique", + "uniqueness_score": 0.7999999999999999, + }, + ).to_dict() + ], "errors": [], - "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'multiclass_too_unique'}).to_dict()] + "actions": [ + DataCheckAction( + DataCheckActionCode.DROP_COL, + metadata={"column": "multiclass_too_unique"}, + ).to_dict() + ], } diff --git a/evalml/tests/model_family_tests/test_model_family.py b/evalml/tests/model_family_tests/test_model_family.py index ddf78502c9..bc65ef08bc 100644 --- a/evalml/tests/model_family_tests/test_model_family.py +++ b/evalml/tests/model_family_tests/test_model_family.py @@ -5,24 +5,45 @@ @pytest.fixture def correct_model_families(): - correct_model_families = [ModelFamily.LINEAR_MODEL, ModelFamily.LIGHTGBM, - ModelFamily.RANDOM_FOREST, ModelFamily.XGBOOST, - ModelFamily.CATBOOST, ModelFamily.EXTRA_TREES, - ModelFamily.DECISION_TREE, ModelFamily.ENSEMBLE, - ModelFamily.BASELINE, ModelFamily.K_NEIGHBORS, - ModelFamily.SVM, ModelFamily.ARIMA, ModelFamily.NONE] + correct_model_families = [ + ModelFamily.LINEAR_MODEL, + ModelFamily.LIGHTGBM, + ModelFamily.RANDOM_FOREST, + ModelFamily.XGBOOST, + ModelFamily.CATBOOST, + ModelFamily.EXTRA_TREES, + ModelFamily.DECISION_TREE, + ModelFamily.ENSEMBLE, + ModelFamily.BASELINE, + ModelFamily.K_NEIGHBORS, + ModelFamily.SVM, + ModelFamily.ARIMA, + ModelFamily.NONE, + ] yield correct_model_families def test_handle_string(correct_model_families): - model_families = ['linear_model', 'lightgbm', 'random_forest', - 'xgboost', 'catboost', 'extra_trees', 'decision_tree', - 'ensemble', 'baseline', 'k_neighbors', 'svm', 'ARIMA', 'none'] + model_families = [ + "linear_model", + "lightgbm", + "random_forest", + "xgboost", + "catboost", + "extra_trees", + "decision_tree", + "ensemble", + "baseline", + "k_neighbors", + "svm", + "ARIMA", + "none", + ] for model_family in zip(model_families, correct_model_families): assert handle_model_family(model_family[0]) == model_family[1] - model_family = 'fake' - error_msg = 'Model family \'fake\' does not exist' + model_family = "fake" + error_msg = "Model family 'fake' does not exist" with pytest.raises(KeyError, match=error_msg): handle_model_family(model_family) == ModelFamily.LINEAR_MODEL @@ -33,7 +54,7 @@ def test_handle_model_family(correct_model_families): def test_handle_incorrect_type(): - error_msg = '`handle_model_family` was not passed a str or ModelFamily object' + error_msg = "`handle_model_family` was not passed a str or ModelFamily object" with pytest.raises(ValueError, match=error_msg): handle_model_family(5) diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_algorithms.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_algorithms.py index c83055c350..f8e6256700 100644 --- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_algorithms.py +++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_algorithms.py @@ -11,20 +11,20 @@ _aggregate_shap_values, _compute_shap_values, _create_dictionary, - 
_normalize_shap_values + _normalize_shap_values, ) from evalml.pipelines import ( BinaryClassificationPipeline, MulticlassClassificationPipeline, RegressionPipeline, - TimeSeriesRegressionPipeline + TimeSeriesRegressionPipeline, ) from evalml.pipelines.components import ( BaselineClassifier, BaselineRegressor, LinearRegressor, RandomForestClassifier, - TimeSeriesBaselineEstimator + TimeSeriesBaselineEstimator, ) from evalml.pipelines.components.utils import _all_estimators_used_in_search from evalml.pipelines.utils import make_pipeline @@ -42,7 +42,12 @@ class Pipeline(base_class): custom_name = estimator.name def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) return Pipeline @@ -52,18 +57,55 @@ def __init__(self, parameters, random_seed=0): data_message = "You must pass in a value for parameter 'training_data' when the pipeline does not have a tree-based estimator. Current estimator model family is Linear." -@pytest.mark.parametrize("pipeline,exception,match", [(make_test_pipeline(BaselineRegressor, RegressionPipeline), ValueError, baseline_message), - (make_test_pipeline(BaselineClassifier, BinaryClassificationPipeline), ValueError, baseline_message), - (make_test_pipeline(BaselineClassifier, MulticlassClassificationPipeline), ValueError, baseline_message), - (make_test_pipeline(TimeSeriesBaselineEstimator, TimeSeriesRegressionPipeline), ValueError, baseline_message), - (make_test_pipeline(RandomForestClassifier, BinaryClassificationPipeline), ValueError, datatype_message), - (make_test_pipeline(LinearRegressor, RegressionPipeline), ValueError, data_message)]) -@patch("evalml.model_understanding.prediction_explanations._algorithms.shap.TreeExplainer") +@pytest.mark.parametrize( + "pipeline,exception,match", + [ + ( + make_test_pipeline(BaselineRegressor, RegressionPipeline), + ValueError, + baseline_message, + ), + ( + make_test_pipeline(BaselineClassifier, BinaryClassificationPipeline), + ValueError, + baseline_message, + ), + ( + make_test_pipeline(BaselineClassifier, MulticlassClassificationPipeline), + ValueError, + baseline_message, + ), + ( + make_test_pipeline( + TimeSeriesBaselineEstimator, TimeSeriesRegressionPipeline + ), + ValueError, + baseline_message, + ), + ( + make_test_pipeline(RandomForestClassifier, BinaryClassificationPipeline), + ValueError, + datatype_message, + ), + ( + make_test_pipeline(LinearRegressor, RegressionPipeline), + ValueError, + data_message, + ), + ], +) +@patch( + "evalml.model_understanding.prediction_explanations._algorithms.shap.TreeExplainer" +) def test_value_errors_raised(mock_tree_explainer, pipeline, exception, match): if "xgboost" in pipeline.custom_name.lower(): - pytest.importorskip("xgboost", "Skipping test because xgboost is not installed.") + pytest.importorskip( + "xgboost", "Skipping test because xgboost is not installed." + ) if "catboost" in pipeline.custom_name.lower(): - pytest.importorskip("catboost", "Skipping test because catboost is not installed.") + pytest.importorskip( + "catboost", "Skipping test because catboost is not installed." 
+ ) pipeline = pipeline({"pipeline": {"date_index": None, "gap": 1, "max_delay": 1}}) @@ -72,7 +114,9 @@ def test_value_errors_raised(mock_tree_explainer, pipeline, exception, match): def test_create_dictionary_exception(): - with pytest.raises(ValueError, match="SHAP values must be stored in a numpy array!"): + with pytest.raises( + ValueError, match="SHAP values must be stored in a numpy array!" + ): _create_dictionary([1, 2, 3], ["a", "b", "c"]) @@ -85,18 +129,33 @@ def calculate_shap_for_test(training_data, y, pipeline, n_points_to_explain): """Helper function to compute the SHAP values for n_points_to_explain for a given pipeline.""" points_to_explain = training_data[:n_points_to_explain] pipeline.fit(training_data, y) - return _compute_shap_values(pipeline, pd.DataFrame(points_to_explain), training_data) + return _compute_shap_values( + pipeline, pd.DataFrame(points_to_explain), training_data + ) -interpretable_estimators = [e for e in _all_estimators_used_in_search() if e.model_family != ModelFamily.BASELINE] +interpretable_estimators = [ + e + for e in _all_estimators_used_in_search() + if e.model_family != ModelFamily.BASELINE +] all_problems = [ProblemTypes.REGRESSION, ProblemTypes.BINARY, ProblemTypes.MULTICLASS] all_n_points_to_explain = [1, 5] -@pytest.mark.parametrize("estimator,problem_type,n_points_to_explain", - product(interpretable_estimators, all_problems, all_n_points_to_explain)) -def test_shap(estimator, problem_type, n_points_to_explain, X_y_binary, X_y_multi, X_y_regression, - helper_functions): +@pytest.mark.parametrize( + "estimator,problem_type,n_points_to_explain", + product(interpretable_estimators, all_problems, all_n_points_to_explain), +) +def test_shap( + estimator, + problem_type, + n_points_to_explain, + X_y_binary, + X_y_multi, + X_y_regression, + helper_functions, +): if problem_type not in estimator.supported_problem_types: pytest.skip("Skipping because estimator and pipeline are not compatible.") @@ -111,41 +170,67 @@ def test_shap(estimator, problem_type, n_points_to_explain, X_y_binary, X_y_mult training_data, y = X_y_regression # TODO: Figure out why we need to change the params in order for shap to pass. Filed as issue 2281 if "Elastic Net" in estimator.name: - parameters = {"Elastic Net Classifier": {"alpha": 0.5, "l1_ratio": 0.5, 'n_jobs': 1}} + parameters = { + "Elastic Net Classifier": {"alpha": 0.5, "l1_ratio": 0.5, "n_jobs": 1} + } else: parameters = {estimator.name: {"n_jobs": 1}} try: - pipeline = make_pipeline(training_data, y, estimator, problem_type, parameters=parameters) + pipeline = make_pipeline( + training_data, y, estimator, problem_type, parameters=parameters + ) except ValueError: pipeline = make_pipeline(training_data, y, estimator, problem_type) - shap_values = calculate_shap_for_test(training_data, y, pipeline, n_points_to_explain) + shap_values = calculate_shap_for_test( + training_data, y, pipeline, n_points_to_explain + ) if problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]: - assert isinstance(shap_values, list), "For binary classification, returned values must be a list" - assert all(isinstance(class_values, dict) for class_values in shap_values), "Not all list elements are lists!" + assert isinstance( + shap_values, list + ), "For binary classification, returned values must be a list" + assert all( + isinstance(class_values, dict) for class_values in shap_values + ), "Not all list elements are lists!" 
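+        # For classification problems, shap_values is expected to be a list with
+        # one dict per class, where each dict maps a feature name to the list of
+        # SHAP values computed for each data point being explained (as asserted
+        # above and in the branches below).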
if is_binary: - assert len(shap_values) == N_CLASSES_BINARY, "A dictionary should be returned for each class!" + assert ( + len(shap_values) == N_CLASSES_BINARY + ), "A dictionary should be returned for each class!" else: - assert len(shap_values) == N_CLASSES_MULTICLASS, "A dictionary should be returned for each class!" + assert ( + len(shap_values) == N_CLASSES_MULTICLASS + ), "A dictionary should be returned for each class!" assert all( - len(values) == N_FEATURES for values in shap_values), "A SHAP value must be computed for every feature!" + len(values) == N_FEATURES for values in shap_values + ), "A SHAP value must be computed for every feature!" for class_values in shap_values: - assert all(isinstance(feature, list) for feature in - class_values.values()), "Every value in the dict must be a list!" - assert all(len(v) == n_points_to_explain for v in - class_values.values()), "A SHAP value must be computed for every data point to explain!" + assert all( + isinstance(feature, list) for feature in class_values.values() + ), "Every value in the dict must be a list!" + assert all( + len(v) == n_points_to_explain for v in class_values.values() + ), "A SHAP value must be computed for every data point to explain!" elif problem_type == ProblemTypes.REGRESSION: - assert isinstance(shap_values, dict), "For regression, returned values must be a dictionary!" - assert len(shap_values) == N_FEATURES, "A SHAP value should be computed for every feature!" - assert all(isinstance(feature, list) for feature in shap_values.values()), "Every value in the dict must be a list!" - assert all(len(v) == n_points_to_explain for v in - shap_values.values()), "A SHAP value must be computed for every data point to explain!" + assert isinstance( + shap_values, dict + ), "For regression, returned values must be a dictionary!" + assert ( + len(shap_values) == N_FEATURES + ), "A SHAP value should be computed for every feature!" + assert all( + isinstance(feature, list) for feature in shap_values.values() + ), "Every value in the dict must be a list!" + assert all( + len(v) == n_points_to_explain for v in shap_values.values() + ), "A SHAP value must be computed for every data point to explain!" -@patch('evalml.model_understanding.prediction_explanations._algorithms.logger') -@patch('shap.TreeExplainer') -def test_compute_shap_values_catches_shap_tree_warnings(mock_tree_explainer, mock_debug, X_y_binary, caplog): +@patch("evalml.model_understanding.prediction_explanations._algorithms.logger") +@patch("shap.TreeExplainer") +def test_compute_shap_values_catches_shap_tree_warnings( + mock_tree_explainer, mock_debug, X_y_binary, caplog +): X, y = X_y_binary pipeline = BinaryClassificationPipeline(["Random Forest Classifier"]) @@ -158,12 +243,16 @@ def raise_warning_from_shap(estimator, feature_perturbation): mock_tree_explainer.side_effect = raise_warning_from_shap _ = _compute_shap_values(pipeline, pd.DataFrame(X)) - mock_debug.debug.assert_called_with("_compute_shap_values TreeExplainer: Shap raised a warning!") + mock_debug.debug.assert_called_with( + "_compute_shap_values TreeExplainer: Shap raised a warning!" 
+ ) def test_normalize_values_exceptions(): - with pytest.raises(ValueError, match="^Unsupported data type for _normalize_shap_values"): + with pytest.raises( + ValueError, match="^Unsupported data type for _normalize_shap_values" + ): _normalize_shap_values(1) @@ -173,19 +262,31 @@ def check_equal_dicts(normalized, answer): np.testing.assert_almost_equal(normalized[key], answer[key], decimal=4) -@pytest.mark.parametrize("values,answer", [({"a": [-0.5, 0, 0.5], "b": [0.1, -0.6, 0.2]}, - {"a": [-0.5 / 0.6, 0, 0.5 / 0.7], "b": [0.1 / 0.6, -1.0, 0.2 / 0.7]}), - ([{"a": [-0.5, 0, 0.5], "b": [0.1, -0.6, 0.2]}] * 2, - [{"a": [-0.5 / 0.6, 0, 0.5 / 0.7], "b": [0.1 / 0.6, -1.0, 0.2 / 0.7]}] * 2), - ({"a": [0, 0]}, {"a": [0, 0]}), - ([{"a": [0]}] * 10, [{"a": [0]}] * 10), - ({"a": [5], "b": [20], "c": [-22]}, - {"a": [5 / 47], "b": [20 / 47], "c": [-22 / 47]}), - ({"a": [5], "b": [-5]}, {"a": [0.5], "b": [-0.5]}), - ({0: [5], "b": [-5]}, {0: [0.5], "b": [-0.5]}), - ({"a": [-0.5, 0, 0.5], 1: [0.1, -0.6, 0.2]}, - {"a": [-0.5 / 0.6, 0, 0.5 / 0.7], 1: [0.1 / 0.6, -1.0, 0.2 / 0.7]}) - ]) +@pytest.mark.parametrize( + "values,answer", + [ + ( + {"a": [-0.5, 0, 0.5], "b": [0.1, -0.6, 0.2]}, + {"a": [-0.5 / 0.6, 0, 0.5 / 0.7], "b": [0.1 / 0.6, -1.0, 0.2 / 0.7]}, + ), + ( + [{"a": [-0.5, 0, 0.5], "b": [0.1, -0.6, 0.2]}] * 2, + [{"a": [-0.5 / 0.6, 0, 0.5 / 0.7], "b": [0.1 / 0.6, -1.0, 0.2 / 0.7]}] * 2, + ), + ({"a": [0, 0]}, {"a": [0, 0]}), + ([{"a": [0]}] * 10, [{"a": [0]}] * 10), + ( + {"a": [5], "b": [20], "c": [-22]}, + {"a": [5 / 47], "b": [20 / 47], "c": [-22 / 47]}, + ), + ({"a": [5], "b": [-5]}, {"a": [0.5], "b": [-0.5]}), + ({0: [5], "b": [-5]}, {0: [0.5], "b": [-0.5]}), + ( + {"a": [-0.5, 0, 0.5], 1: [0.1, -0.6, 0.2]}, + {"a": [-0.5 / 0.6, 0, 0.5 / 0.7], 1: [0.1 / 0.6, -1.0, 0.2 / 0.7]}, + ), + ], +) def test_normalize_values(values, answer): normalized = _normalize_shap_values(values) @@ -198,14 +299,27 @@ def test_normalize_values(values, answer): check_equal_dicts(values, correct) -@pytest.mark.parametrize("values,provenance,answer", [({"a_0": [-0.5, 0, 0.5], "a_1": [1, 1, 2], "b": [0.1, -0.6, 0.2]}, - {"a": ["a_0", "a_1"]}, - {"a": [0.5, 1, 2.5], "b": [0.1, -0.6, 0.2]}), - ([{"a_0": [0.5, 1.0, 2.0], "a_1": [1.2, 1.5, 0.6], "b": [0.5, 0.2, 0.5]}, - {"a_0": [-0.5, 0, 0.5], "a_1": [1, 1, 2], "b": [0.1, -0.6, 0.2]}], - {"a": ["a_0", "a_1"], "c": ["c_1", "c_2"]}, - [{"a": [1.7, 2.5, 2.6], "b": [0.5, 0.2, 0.5]}, - {"a": [0.5, 1, 2.5], "b": [0.1, -0.6, 0.2]}])]) +@pytest.mark.parametrize( + "values,provenance,answer", + [ + ( + {"a_0": [-0.5, 0, 0.5], "a_1": [1, 1, 2], "b": [0.1, -0.6, 0.2]}, + {"a": ["a_0", "a_1"]}, + {"a": [0.5, 1, 2.5], "b": [0.1, -0.6, 0.2]}, + ), + ( + [ + {"a_0": [0.5, 1.0, 2.0], "a_1": [1.2, 1.5, 0.6], "b": [0.5, 0.2, 0.5]}, + {"a_0": [-0.5, 0, 0.5], "a_1": [1, 1, 2], "b": [0.1, -0.6, 0.2]}, + ], + {"a": ["a_0", "a_1"], "c": ["c_1", "c_2"]}, + [ + {"a": [1.7, 2.5, 2.6], "b": [0.5, 0.2, 0.5]}, + {"a": [0.5, 1, 2.5], "b": [0.1, -0.6, 0.2]}, + ], + ), + ], +) def test_aggregate_values(values, provenance, answer): aggregated = _aggregate_shap_values(values, provenance) diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py index 34bfd4f230..2085b7b585 100644 --- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py +++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py @@ 
-12,22 +12,17 @@ abs_error, cross_entropy, explain_predictions, - explain_predictions_best_worst + explain_predictions_best_worst, ) from evalml.pipelines import ( BinaryClassificationPipeline, MulticlassClassificationPipeline, RegressionPipeline, TimeSeriesBinaryClassificationPipeline, - TimeSeriesRegressionPipeline + TimeSeriesRegressionPipeline, ) from evalml.pipelines.components.utils import _all_estimators -from evalml.problem_types import ( - ProblemTypes, - is_binary, - is_multiclass, - is_regression -) +from evalml.problem_types import ProblemTypes, is_binary, is_multiclass, is_regression def compare_two_tables(table_1, table_2): @@ -38,20 +33,42 @@ def compare_two_tables(table_1, table_2): def test_error_metrics(): - np.testing.assert_array_equal(abs_error(pd.Series([1, 2, 3]), pd.Series([4, 1, 0])), np.array([3, 1, 3])) - np.testing.assert_allclose(cross_entropy(pd.Series([1, 0]), - pd.DataFrame({"a": [0.1, 0.2], "b": [0.9, 0.8]})), - np.array([-np.log(0.9), -np.log(0.2)])) - - -input_features_and_y_true = [([[1]], pd.Series([1]), "^Input features must be a dataframe with more than 10 rows!"), - (pd.DataFrame({"a": [1]}), pd.Series([1]), "^Input features must be a dataframe with more than 10 rows!"), - (pd.DataFrame({"a": range(15)}), pd.Series(range(12)), "^Parameters y_true and input_features must have the same number of data points.") - ] - - -@pytest.mark.parametrize("input_features,y_true,error_message", input_features_and_y_true) -def test_explain_predictions_best_worst_value_errors(input_features, y_true, error_message): + np.testing.assert_array_equal( + abs_error(pd.Series([1, 2, 3]), pd.Series([4, 1, 0])), np.array([3, 1, 3]) + ) + np.testing.assert_allclose( + cross_entropy( + pd.Series([1, 0]), pd.DataFrame({"a": [0.1, 0.2], "b": [0.9, 0.8]}) + ), + np.array([-np.log(0.9), -np.log(0.2)]), + ) + + +input_features_and_y_true = [ + ( + [[1]], + pd.Series([1]), + "^Input features must be a dataframe with more than 10 rows!", + ), + ( + pd.DataFrame({"a": [1]}), + pd.Series([1]), + "^Input features must be a dataframe with more than 10 rows!", + ), + ( + pd.DataFrame({"a": range(15)}), + pd.Series(range(12)), + "^Parameters y_true and input_features must have the same number of data points.", + ), +] + + +@pytest.mark.parametrize( + "input_features,y_true,error_message", input_features_and_y_true +) +def test_explain_predictions_best_worst_value_errors( + input_features, y_true, error_message +): with pytest.raises(ValueError, match=error_message): explain_predictions_best_worst(None, input_features, y_true) @@ -65,31 +82,67 @@ def raise_zero_division(input_features): pipeline = MagicMock() pipeline.problem_type = ProblemTypes.BINARY pipeline.predict_proba.side_effect = raise_zero_division - explain_predictions_best_worst(pipeline, pd.DataFrame({"a": range(15)}), pd.Series(range(15))) + explain_predictions_best_worst( + pipeline, pd.DataFrame({"a": range(15)}), pd.Series(range(15)) + ) def test_explain_predictions_value_errors(): - with pytest.raises(ValueError, match="Parameter input_features must be a non-empty dataframe."): + with pytest.raises( + ValueError, match="Parameter input_features must be a non-empty dataframe." 
+ ): explain_predictions(MagicMock(), pd.DataFrame(), y=None, indices_to_explain=[0]) with pytest.raises(ValueError, match="Explained indices should be between"): - explain_predictions(MagicMock(), pd.DataFrame({"a": [0, 1, 2, 3, 4]}), y=None, indices_to_explain=[5]) + explain_predictions( + MagicMock(), + pd.DataFrame({"a": [0, 1, 2, 3, 4]}), + y=None, + indices_to_explain=[5], + ) with pytest.raises(ValueError, match="Explained indices should be between"): - explain_predictions(MagicMock(), pd.DataFrame({"a": [0, 1, 2, 3, 4]}), y=None, indices_to_explain=[1, 5]) + explain_predictions( + MagicMock(), + pd.DataFrame({"a": [0, 1, 2, 3, 4]}), + y=None, + indices_to_explain=[1, 5], + ) with pytest.raises(ValueError, match="Explained indices should be between"): - explain_predictions(MagicMock(), pd.DataFrame({"a": [0, 1, 2, 3, 4]}), y=None, indices_to_explain=[-1]) + explain_predictions( + MagicMock(), + pd.DataFrame({"a": [0, 1, 2, 3, 4]}), + y=None, + indices_to_explain=[-1], + ) def test_output_format_checked(): input_features, y_true = pd.DataFrame(data=[range(15)]), pd.Series(range(15)) - with pytest.raises(ValueError, match="Parameter output_format must be either text, dict, or dataframe. Received bar"): - explain_predictions(pipeline=MagicMock(), input_features=input_features, y=None, indices_to_explain=0, output_format="bar") + with pytest.raises( + ValueError, + match="Parameter output_format must be either text, dict, or dataframe. Received bar", + ): + explain_predictions( + pipeline=MagicMock(), + input_features=input_features, + y=None, + indices_to_explain=0, + output_format="bar", + ) input_features, y_true = pd.DataFrame(data=range(15)), pd.Series(range(15)) - with pytest.raises(ValueError, match="Parameter output_format must be either text, dict, or dataframe. Received foo"): - explain_predictions_best_worst(pipeline=MagicMock(), input_features=input_features, y_true=y_true, output_format="foo") + with pytest.raises( + ValueError, + match="Parameter output_format must be either text, dict, or dataframe. 
Received foo", + ): + explain_predictions_best_worst( + pipeline=MagicMock(), + input_features=input_features, + y_true=y_true, + output_format="foo", + ) regression_best_worst_answer = """Test Pipeline Name @@ -120,29 +173,45 @@ def test_output_format_checked(): regression_best_worst_answer_dict = { "explanations": [ - {"rank": {"prefix": "best", "index": 1}, - "predicted_values": {"probabilities": None, "predicted_value": 1, "target_value": 2, - "error_name": "Absolute Difference", "error_value": 1.}, - "explanations": ["explanation_dictionary_goes_here"]}, - {"rank": {"prefix": "worst", "index": 1}, - "predicted_values": {"probabilities": None, "predicted_value": 2, "target_value": 3, - "error_name": "Absolute Difference", "error_value": 4.}, - "explanations": ["explanation_dictionary_goes_here"]} + { + "rank": {"prefix": "best", "index": 1}, + "predicted_values": { + "probabilities": None, + "predicted_value": 1, + "target_value": 2, + "error_name": "Absolute Difference", + "error_value": 1.0, + }, + "explanations": ["explanation_dictionary_goes_here"], + }, + { + "rank": {"prefix": "worst", "index": 1}, + "predicted_values": { + "probabilities": None, + "predicted_value": 2, + "target_value": 3, + "error_name": "Absolute Difference", + "error_value": 4.0, + }, + "explanations": ["explanation_dictionary_goes_here"], + }, ] } -regression_best_worst_answer_df = pd.DataFrame({ - "feature_names": [0, 0], - "feature_values": [0, 0], - "qualitative_explanation": [0, 0], - "quantitative_explanation": [0, 0], - "rank": [1, 1], - "predicted_value": [1, 2], - "target_value": [2, 3], - "error_name": ["Absolute Difference"] * 2, - "error_value": [1., 4.], - "prefix": ["best", "worst"], -}) +regression_best_worst_answer_df = pd.DataFrame( + { + "feature_names": [0, 0], + "feature_values": [0, 0], + "qualitative_explanation": [0, 0], + "quantitative_explanation": [0, 0], + "rank": [1, 1], + "predicted_value": [1, 2], + "target_value": [2, 3], + "error_name": ["Absolute Difference"] * 2, + "error_value": [1.0, 4.0], + "prefix": ["best", "worst"], + } +) no_best_worst_answer = """Test Pipeline Name @@ -163,17 +232,19 @@ def test_output_format_checked(): no_best_worst_answer_dict = { "explanations": [ {"explanations": ["explanation_dictionary_goes_here"]}, - {"explanations": ["explanation_dictionary_goes_here"]} + {"explanations": ["explanation_dictionary_goes_here"]}, ] } -no_best_worst_answer_df = pd.DataFrame({ - "feature_names": [0, 0], - "feature_values": [0, 0], - "qualitative_explanation": [0, 0], - "quantitative_explanation": [0, 0], - "prediction_number": [0, 1] -}) +no_best_worst_answer_df = pd.DataFrame( + { + "feature_names": [0, 0], + "feature_values": [0, 0], + "qualitative_explanation": [0, 0], + "quantitative_explanation": [0, 0], + "prediction_number": [0, 1], + } +) binary_best_worst_answer = """Test Pipeline Name @@ -205,33 +276,47 @@ def test_output_format_checked(): binary_best_worst_answer_dict = { "explanations": [ - {"rank": {"prefix": "best", "index": 1}, - "predicted_values": {"probabilities": {"benign": 0.05, "malignant": 0.95}, - "predicted_value": "malignant", "target_value": "malignant", - "error_name": "Cross Entropy", "error_value": 0.2}, - "explanations": ["explanation_dictionary_goes_here"]}, - {"rank": {"prefix": "worst", "index": 1}, - "predicted_values": {"probabilities": {"benign": 0.1, "malignant": 0.9}, - "predicted_value": "malignant", "target_value": "benign", - "error_name": "Cross Entropy", "error_value": 0.78}, - "explanations": 
["explanation_dictionary_goes_here"]} + { + "rank": {"prefix": "best", "index": 1}, + "predicted_values": { + "probabilities": {"benign": 0.05, "malignant": 0.95}, + "predicted_value": "malignant", + "target_value": "malignant", + "error_name": "Cross Entropy", + "error_value": 0.2, + }, + "explanations": ["explanation_dictionary_goes_here"], + }, + { + "rank": {"prefix": "worst", "index": 1}, + "predicted_values": { + "probabilities": {"benign": 0.1, "malignant": 0.9}, + "predicted_value": "malignant", + "target_value": "benign", + "error_name": "Cross Entropy", + "error_value": 0.78, + }, + "explanations": ["explanation_dictionary_goes_here"], + }, ] } -binary_best_worst_answer_df = pd.DataFrame({ - "feature_names": [0, 0], - "feature_values": [0, 0], - "qualitative_explanation": [0, 0], - "quantitative_explanation": [0, 0], - "rank": [1, 1], - "prefix": ["best", "worst"], - "label_benign_probability": [0.05, 0.1], - "label_malignant_probability": [0.95, 0.9], - "predicted_value": ["malignant", "malignant"], - "target_value": ["malignant", "benign"], - "error_name": ["Cross Entropy"] * 2, - "error_value": [0.2, 0.78] -}) +binary_best_worst_answer_df = pd.DataFrame( + { + "feature_names": [0, 0], + "feature_values": [0, 0], + "qualitative_explanation": [0, 0], + "quantitative_explanation": [0, 0], + "rank": [1, 1], + "prefix": ["best", "worst"], + "label_benign_probability": [0.05, 0.1], + "label_malignant_probability": [0.95, 0.9], + "predicted_value": ["malignant", "malignant"], + "target_value": ["malignant", "benign"], + "error_name": ["Cross Entropy"] * 2, + "error_value": [0.2, 0.78], + } +) multiclass_table = """Class: setosa @@ -273,38 +358,54 @@ def test_output_format_checked(): {multiclass_table} -""".format(multiclass_table=multiclass_table) +""".format( + multiclass_table=multiclass_table +) multiclass_best_worst_answer_dict = { "explanations": [ - {"rank": {"prefix": "best", "index": 1}, - "predicted_values": {"probabilities": {"setosa": 0.8, "versicolor": 0.1, "virginica": 0.1}, - "predicted_value": "setosa", "target_value": "setosa", - "error_name": "Cross Entropy", "error_value": 0.15}, - "explanations": ["explanation_dictionary_goes_here"]}, - {"rank": {"prefix": "worst", "index": 1}, - "predicted_values": {"probabilities": {"setosa": 0.2, "versicolor": 0.75, "virginica": 0.05}, - "predicted_value": "versicolor", "target_value": "versicolor", - "error_name": "Cross Entropy", "error_value": 0.34}, - "explanations": ["explanation_dictionary_goes_here"]} + { + "rank": {"prefix": "best", "index": 1}, + "predicted_values": { + "probabilities": {"setosa": 0.8, "versicolor": 0.1, "virginica": 0.1}, + "predicted_value": "setosa", + "target_value": "setosa", + "error_name": "Cross Entropy", + "error_value": 0.15, + }, + "explanations": ["explanation_dictionary_goes_here"], + }, + { + "rank": {"prefix": "worst", "index": 1}, + "predicted_values": { + "probabilities": {"setosa": 0.2, "versicolor": 0.75, "virginica": 0.05}, + "predicted_value": "versicolor", + "target_value": "versicolor", + "error_name": "Cross Entropy", + "error_value": 0.34, + }, + "explanations": ["explanation_dictionary_goes_here"], + }, ] } -multiclass_best_worst_answer_df = pd.DataFrame({ - "feature_names": [0, 0], - "feature_values": [0, 0], - "qualitative_explanation": [0, 0], - "quantitative_explanation": [0, 0], - "rank": [1, 1], - "prefix": ["best", "worst"], - "label_setosa_probability": [0.8, 0.2], - "label_versicolor_probability": [0.1, 0.75], - "label_virginica_probability": [0.1, 0.05], - 
"predicted_value": ["setosa", "versicolor"], - "target_value": ["setosa", "versicolor"], - "error_name": ["Cross Entropy"] * 2, - "error_value": [0.15, 0.34] -}) +multiclass_best_worst_answer_df = pd.DataFrame( + { + "feature_names": [0, 0], + "feature_values": [0, 0], + "qualitative_explanation": [0, 0], + "quantitative_explanation": [0, 0], + "rank": [1, 1], + "prefix": ["best", "worst"], + "label_setosa_probability": [0.8, 0.2], + "label_versicolor_probability": [0.1, 0.75], + "label_virginica_probability": [0.1, 0.05], + "predicted_value": ["setosa", "versicolor"], + "target_value": ["setosa", "versicolor"], + "error_name": ["Cross Entropy"] * 2, + "error_value": [0.15, 0.34], + } +) multiclass_no_best_worst_answer = """Test Pipeline Name @@ -320,83 +421,424 @@ def test_output_format_checked(): {multiclass_table} -""".format(multiclass_table=multiclass_table) - - -@pytest.mark.parametrize("problem_type,output_format,answer,explain_predictions_answer,custom_index", - [(ProblemTypes.REGRESSION, "text", regression_best_worst_answer, no_best_worst_answer, [0, 1]), - (ProblemTypes.REGRESSION, "dict", regression_best_worst_answer_dict, no_best_worst_answer_dict, [0, 1]), - (ProblemTypes.REGRESSION, "dataframe", regression_best_worst_answer_df, no_best_worst_answer_df, [0, 1]), - (ProblemTypes.REGRESSION, "text", regression_best_worst_answer, no_best_worst_answer, [4, 23]), - (ProblemTypes.REGRESSION, "dict", regression_best_worst_answer_dict, no_best_worst_answer_dict, [4, 10]), - (ProblemTypes.REGRESSION, "dataframe", regression_best_worst_answer_df, no_best_worst_answer_df, [4, 10]), - (ProblemTypes.REGRESSION, "text", regression_best_worst_answer, no_best_worst_answer, ["foo", "bar"]), - (ProblemTypes.REGRESSION, "dict", regression_best_worst_answer_dict, no_best_worst_answer_dict, ["foo", "bar"]), - (ProblemTypes.REGRESSION, "dataframe", regression_best_worst_answer_df, no_best_worst_answer_df, ["foo", "bar"]), - (ProblemTypes.BINARY, "text", binary_best_worst_answer, no_best_worst_answer, [0, 1]), - (ProblemTypes.BINARY, "dict", binary_best_worst_answer_dict, no_best_worst_answer_dict, [0, 1]), - (ProblemTypes.BINARY, "dataframe", binary_best_worst_answer_df, no_best_worst_answer_df, [0, 1]), - (ProblemTypes.BINARY, "text", binary_best_worst_answer, no_best_worst_answer, [7, 11]), - (ProblemTypes.BINARY, "dict", binary_best_worst_answer_dict, no_best_worst_answer_dict, [7, 11]), - (ProblemTypes.BINARY, "dataframe", binary_best_worst_answer_df, no_best_worst_answer_df, [7, 11]), - (ProblemTypes.BINARY, "text", binary_best_worst_answer, no_best_worst_answer, ["first", "second"]), - (ProblemTypes.BINARY, "dict", binary_best_worst_answer_dict, no_best_worst_answer_dict, ["first", "second"]), - (ProblemTypes.BINARY, "dataframe", binary_best_worst_answer_df, no_best_worst_answer_df, ["first", "second"]), - (ProblemTypes.MULTICLASS, "text", multiclass_best_worst_answer, multiclass_no_best_worst_answer, [0, 1]), - (ProblemTypes.MULTICLASS, "dict", multiclass_best_worst_answer_dict, no_best_worst_answer_dict, [0, 1]), - (ProblemTypes.MULTICLASS, "dataframe", multiclass_best_worst_answer_df, no_best_worst_answer_df, [0, 1]), - (ProblemTypes.MULTICLASS, "text", multiclass_best_worst_answer, multiclass_no_best_worst_answer, [19, 103]), - (ProblemTypes.MULTICLASS, "dict", multiclass_best_worst_answer_dict, no_best_worst_answer_dict, [17, 235]), - (ProblemTypes.MULTICLASS, "dataframe", multiclass_best_worst_answer_df, no_best_worst_answer_df, [17, 235]), - (ProblemTypes.MULTICLASS, "text", 
multiclass_best_worst_answer, multiclass_no_best_worst_answer, ["2020-10", "2020-11"]), - (ProblemTypes.MULTICLASS, "dict", multiclass_best_worst_answer_dict, no_best_worst_answer_dict, ["2020-15", "2020-15"]), - (ProblemTypes.MULTICLASS, "dataframe", multiclass_best_worst_answer_df, no_best_worst_answer_df, ["2020-15", "2020-15"]), - (ProblemTypes.TIME_SERIES_REGRESSION, "text", regression_best_worst_answer, no_best_worst_answer, [0, 1]), - (ProblemTypes.TIME_SERIES_REGRESSION, "dict", regression_best_worst_answer_dict, no_best_worst_answer_dict, [0, 1]), - (ProblemTypes.TIME_SERIES_REGRESSION, "dataframe", regression_best_worst_answer_df, no_best_worst_answer_df, [0, 1]), - (ProblemTypes.TIME_SERIES_REGRESSION, "text", regression_best_worst_answer, no_best_worst_answer, [4, 23]), - (ProblemTypes.TIME_SERIES_REGRESSION, "dict", regression_best_worst_answer_dict, no_best_worst_answer_dict, [4, 10]), - (ProblemTypes.TIME_SERIES_REGRESSION, "dataframe", regression_best_worst_answer_df, no_best_worst_answer_df, [4, 10]), - (ProblemTypes.TIME_SERIES_REGRESSION, "text", regression_best_worst_answer, no_best_worst_answer, ["foo", "bar"]), - (ProblemTypes.TIME_SERIES_REGRESSION, "dict", regression_best_worst_answer_dict, no_best_worst_answer_dict, ["foo", "bar"]), - (ProblemTypes.TIME_SERIES_REGRESSION, "dataframe", regression_best_worst_answer_df, no_best_worst_answer_df, ["foo", "bar"]), - (ProblemTypes.TIME_SERIES_BINARY, "text", binary_best_worst_answer, no_best_worst_answer, [0, 1]), - (ProblemTypes.TIME_SERIES_BINARY, "dict", binary_best_worst_answer_dict, no_best_worst_answer_dict, [0, 1]), - (ProblemTypes.TIME_SERIES_BINARY, "dataframe", binary_best_worst_answer_df, no_best_worst_answer_df, [0, 1]), - (ProblemTypes.TIME_SERIES_BINARY, "text", binary_best_worst_answer, no_best_worst_answer, [7, 11]), - (ProblemTypes.TIME_SERIES_BINARY, "dict", binary_best_worst_answer_dict, no_best_worst_answer_dict, [7, 11]), - (ProblemTypes.TIME_SERIES_BINARY, "dataframe", binary_best_worst_answer_df, no_best_worst_answer_df, [7, 11]), - (ProblemTypes.TIME_SERIES_BINARY, "text", binary_best_worst_answer, no_best_worst_answer, ["first", "second"]), - (ProblemTypes.TIME_SERIES_BINARY, "dict", binary_best_worst_answer_dict, no_best_worst_answer_dict, ["first", "second"]), - (ProblemTypes.TIME_SERIES_BINARY, "dataframe", binary_best_worst_answer_df, no_best_worst_answer_df, ["first", "second"]), - (ProblemTypes.TIME_SERIES_MULTICLASS, "text", multiclass_best_worst_answer, multiclass_no_best_worst_answer, [0, 1]), - (ProblemTypes.TIME_SERIES_MULTICLASS, "dict", multiclass_best_worst_answer_dict, no_best_worst_answer_dict, [0, 1]), - (ProblemTypes.TIME_SERIES_MULTICLASS, "dataframe", multiclass_best_worst_answer_df, no_best_worst_answer_df, [0, 1]), - (ProblemTypes.TIME_SERIES_MULTICLASS, "text", multiclass_best_worst_answer, multiclass_no_best_worst_answer, [19, 103]), - (ProblemTypes.TIME_SERIES_MULTICLASS, "dict", multiclass_best_worst_answer_dict, no_best_worst_answer_dict, [17, 235]), - (ProblemTypes.TIME_SERIES_MULTICLASS, "dataframe", multiclass_best_worst_answer_df, no_best_worst_answer_df, [17, 235]), - (ProblemTypes.TIME_SERIES_MULTICLASS, "text", multiclass_best_worst_answer, multiclass_no_best_worst_answer, ["2020-10", "2020-11"]), - (ProblemTypes.TIME_SERIES_MULTICLASS, "dict", multiclass_best_worst_answer_dict, no_best_worst_answer_dict, ["2020-15", "2020-15"]), - (ProblemTypes.TIME_SERIES_MULTICLASS, "dataframe", multiclass_best_worst_answer_df, no_best_worst_answer_df, ["2020-15", "2020-15"]), 
- ]) +""".format( + multiclass_table=multiclass_table +) + + +@pytest.mark.parametrize( + "problem_type,output_format,answer,explain_predictions_answer,custom_index", + [ + ( + ProblemTypes.REGRESSION, + "text", + regression_best_worst_answer, + no_best_worst_answer, + [0, 1], + ), + ( + ProblemTypes.REGRESSION, + "dict", + regression_best_worst_answer_dict, + no_best_worst_answer_dict, + [0, 1], + ), + ( + ProblemTypes.REGRESSION, + "dataframe", + regression_best_worst_answer_df, + no_best_worst_answer_df, + [0, 1], + ), + ( + ProblemTypes.REGRESSION, + "text", + regression_best_worst_answer, + no_best_worst_answer, + [4, 23], + ), + ( + ProblemTypes.REGRESSION, + "dict", + regression_best_worst_answer_dict, + no_best_worst_answer_dict, + [4, 10], + ), + ( + ProblemTypes.REGRESSION, + "dataframe", + regression_best_worst_answer_df, + no_best_worst_answer_df, + [4, 10], + ), + ( + ProblemTypes.REGRESSION, + "text", + regression_best_worst_answer, + no_best_worst_answer, + ["foo", "bar"], + ), + ( + ProblemTypes.REGRESSION, + "dict", + regression_best_worst_answer_dict, + no_best_worst_answer_dict, + ["foo", "bar"], + ), + ( + ProblemTypes.REGRESSION, + "dataframe", + regression_best_worst_answer_df, + no_best_worst_answer_df, + ["foo", "bar"], + ), + ( + ProblemTypes.BINARY, + "text", + binary_best_worst_answer, + no_best_worst_answer, + [0, 1], + ), + ( + ProblemTypes.BINARY, + "dict", + binary_best_worst_answer_dict, + no_best_worst_answer_dict, + [0, 1], + ), + ( + ProblemTypes.BINARY, + "dataframe", + binary_best_worst_answer_df, + no_best_worst_answer_df, + [0, 1], + ), + ( + ProblemTypes.BINARY, + "text", + binary_best_worst_answer, + no_best_worst_answer, + [7, 11], + ), + ( + ProblemTypes.BINARY, + "dict", + binary_best_worst_answer_dict, + no_best_worst_answer_dict, + [7, 11], + ), + ( + ProblemTypes.BINARY, + "dataframe", + binary_best_worst_answer_df, + no_best_worst_answer_df, + [7, 11], + ), + ( + ProblemTypes.BINARY, + "text", + binary_best_worst_answer, + no_best_worst_answer, + ["first", "second"], + ), + ( + ProblemTypes.BINARY, + "dict", + binary_best_worst_answer_dict, + no_best_worst_answer_dict, + ["first", "second"], + ), + ( + ProblemTypes.BINARY, + "dataframe", + binary_best_worst_answer_df, + no_best_worst_answer_df, + ["first", "second"], + ), + ( + ProblemTypes.MULTICLASS, + "text", + multiclass_best_worst_answer, + multiclass_no_best_worst_answer, + [0, 1], + ), + ( + ProblemTypes.MULTICLASS, + "dict", + multiclass_best_worst_answer_dict, + no_best_worst_answer_dict, + [0, 1], + ), + ( + ProblemTypes.MULTICLASS, + "dataframe", + multiclass_best_worst_answer_df, + no_best_worst_answer_df, + [0, 1], + ), + ( + ProblemTypes.MULTICLASS, + "text", + multiclass_best_worst_answer, + multiclass_no_best_worst_answer, + [19, 103], + ), + ( + ProblemTypes.MULTICLASS, + "dict", + multiclass_best_worst_answer_dict, + no_best_worst_answer_dict, + [17, 235], + ), + ( + ProblemTypes.MULTICLASS, + "dataframe", + multiclass_best_worst_answer_df, + no_best_worst_answer_df, + [17, 235], + ), + ( + ProblemTypes.MULTICLASS, + "text", + multiclass_best_worst_answer, + multiclass_no_best_worst_answer, + ["2020-10", "2020-11"], + ), + ( + ProblemTypes.MULTICLASS, + "dict", + multiclass_best_worst_answer_dict, + no_best_worst_answer_dict, + ["2020-15", "2020-15"], + ), + ( + ProblemTypes.MULTICLASS, + "dataframe", + multiclass_best_worst_answer_df, + no_best_worst_answer_df, + ["2020-15", "2020-15"], + ), + ( + ProblemTypes.TIME_SERIES_REGRESSION, + "text", + 
regression_best_worst_answer, + no_best_worst_answer, + [0, 1], + ), + ( + ProblemTypes.TIME_SERIES_REGRESSION, + "dict", + regression_best_worst_answer_dict, + no_best_worst_answer_dict, + [0, 1], + ), + ( + ProblemTypes.TIME_SERIES_REGRESSION, + "dataframe", + regression_best_worst_answer_df, + no_best_worst_answer_df, + [0, 1], + ), + ( + ProblemTypes.TIME_SERIES_REGRESSION, + "text", + regression_best_worst_answer, + no_best_worst_answer, + [4, 23], + ), + ( + ProblemTypes.TIME_SERIES_REGRESSION, + "dict", + regression_best_worst_answer_dict, + no_best_worst_answer_dict, + [4, 10], + ), + ( + ProblemTypes.TIME_SERIES_REGRESSION, + "dataframe", + regression_best_worst_answer_df, + no_best_worst_answer_df, + [4, 10], + ), + ( + ProblemTypes.TIME_SERIES_REGRESSION, + "text", + regression_best_worst_answer, + no_best_worst_answer, + ["foo", "bar"], + ), + ( + ProblemTypes.TIME_SERIES_REGRESSION, + "dict", + regression_best_worst_answer_dict, + no_best_worst_answer_dict, + ["foo", "bar"], + ), + ( + ProblemTypes.TIME_SERIES_REGRESSION, + "dataframe", + regression_best_worst_answer_df, + no_best_worst_answer_df, + ["foo", "bar"], + ), + ( + ProblemTypes.TIME_SERIES_BINARY, + "text", + binary_best_worst_answer, + no_best_worst_answer, + [0, 1], + ), + ( + ProblemTypes.TIME_SERIES_BINARY, + "dict", + binary_best_worst_answer_dict, + no_best_worst_answer_dict, + [0, 1], + ), + ( + ProblemTypes.TIME_SERIES_BINARY, + "dataframe", + binary_best_worst_answer_df, + no_best_worst_answer_df, + [0, 1], + ), + ( + ProblemTypes.TIME_SERIES_BINARY, + "text", + binary_best_worst_answer, + no_best_worst_answer, + [7, 11], + ), + ( + ProblemTypes.TIME_SERIES_BINARY, + "dict", + binary_best_worst_answer_dict, + no_best_worst_answer_dict, + [7, 11], + ), + ( + ProblemTypes.TIME_SERIES_BINARY, + "dataframe", + binary_best_worst_answer_df, + no_best_worst_answer_df, + [7, 11], + ), + ( + ProblemTypes.TIME_SERIES_BINARY, + "text", + binary_best_worst_answer, + no_best_worst_answer, + ["first", "second"], + ), + ( + ProblemTypes.TIME_SERIES_BINARY, + "dict", + binary_best_worst_answer_dict, + no_best_worst_answer_dict, + ["first", "second"], + ), + ( + ProblemTypes.TIME_SERIES_BINARY, + "dataframe", + binary_best_worst_answer_df, + no_best_worst_answer_df, + ["first", "second"], + ), + ( + ProblemTypes.TIME_SERIES_MULTICLASS, + "text", + multiclass_best_worst_answer, + multiclass_no_best_worst_answer, + [0, 1], + ), + ( + ProblemTypes.TIME_SERIES_MULTICLASS, + "dict", + multiclass_best_worst_answer_dict, + no_best_worst_answer_dict, + [0, 1], + ), + ( + ProblemTypes.TIME_SERIES_MULTICLASS, + "dataframe", + multiclass_best_worst_answer_df, + no_best_worst_answer_df, + [0, 1], + ), + ( + ProblemTypes.TIME_SERIES_MULTICLASS, + "text", + multiclass_best_worst_answer, + multiclass_no_best_worst_answer, + [19, 103], + ), + ( + ProblemTypes.TIME_SERIES_MULTICLASS, + "dict", + multiclass_best_worst_answer_dict, + no_best_worst_answer_dict, + [17, 235], + ), + ( + ProblemTypes.TIME_SERIES_MULTICLASS, + "dataframe", + multiclass_best_worst_answer_df, + no_best_worst_answer_df, + [17, 235], + ), + ( + ProblemTypes.TIME_SERIES_MULTICLASS, + "text", + multiclass_best_worst_answer, + multiclass_no_best_worst_answer, + ["2020-10", "2020-11"], + ), + ( + ProblemTypes.TIME_SERIES_MULTICLASS, + "dict", + multiclass_best_worst_answer_dict, + no_best_worst_answer_dict, + ["2020-15", "2020-15"], + ), + ( + ProblemTypes.TIME_SERIES_MULTICLASS, + "dataframe", + multiclass_best_worst_answer_df, + no_best_worst_answer_df, + ["2020-15", 
"2020-15"], + ), + ], +) @patch("evalml.model_understanding.prediction_explanations.explainers.DEFAULT_METRICS") -@patch("evalml.model_understanding.prediction_explanations._user_interface._make_single_prediction_shap_table") -def test_explain_predictions_best_worst_and_explain_predictions(mock_make_table, mock_default_metrics, - problem_type, output_format, answer, - explain_predictions_answer, custom_index): +@patch( + "evalml.model_understanding.prediction_explanations._user_interface._make_single_prediction_shap_table" +) +def test_explain_predictions_best_worst_and_explain_predictions( + mock_make_table, + mock_default_metrics, + problem_type, + output_format, + answer, + explain_predictions_answer, + custom_index, +): if output_format == "text": mock_make_table.return_value = "table goes here" elif output_format == "dataframe": - shap_table = pd.DataFrame({ - "feature_names": [0], - "feature_values": [0], - "qualitative_explanation": [0], - "quantitative_explanation": [0], - }) + shap_table = pd.DataFrame( + { + "feature_names": [0], + "feature_values": [0], + "qualitative_explanation": [0], + "quantitative_explanation": [0], + } + ) # Use side effect so that we always get a new copy of the dataframe mock_make_table.side_effect = lambda *args, **kwargs: shap_table.copy() else: - mock_make_table.return_value = {"explanations": ["explanation_dictionary_goes_here"]} + mock_make_table.return_value = { + "explanations": ["explanation_dictionary_goes_here"] + } pipeline = MagicMock() pipeline.parameters = "Parameters go here" @@ -413,7 +855,7 @@ def _add_custom_index(answer, index_best, index_worst, output_format): elif output_format == "dataframe": col_name = "prefix" if "prefix" in answer.columns else "rank" n_repeats = answer[col_name].value_counts().tolist()[0] - answer['index_id'] = [index_best] * n_repeats + [index_worst] * n_repeats + answer["index_id"] = [index_best] * n_repeats + [index_worst] * n_repeats else: answer["explanations"][0]["predicted_values"]["index_id"] = index_best answer["explanations"][1]["predicted_values"]["index_id"] = index_worst @@ -421,12 +863,16 @@ def _add_custom_index(answer, index_best, index_worst, output_format): if is_regression(problem_type): abs_error_mock = MagicMock(__name__="abs_error") - abs_error_mock.return_value = pd.Series([4., 1.], dtype="float64") + abs_error_mock.return_value = pd.Series([4.0, 1.0], dtype="float64") mock_default_metrics.__getitem__.return_value = abs_error_mock pipeline.predict.return_value = ww.init_series(pd.Series([2, 1])) y_true = pd.Series([3, 2], index=custom_index) - answer = _add_custom_index(answer, index_best=custom_index[1], - index_worst=custom_index[0], output_format=output_format) + answer = _add_custom_index( + answer, + index_best=custom_index[1], + index_worst=custom_index[0], + output_format=output_format, + ) elif is_binary(problem_type): pipeline.classes_.return_value = ["benign", "malignant"] cross_entropy_mock = MagicMock(__name__="cross_entropy") @@ -437,8 +883,12 @@ def _add_custom_index(answer, index_best, index_worst, output_format): pipeline.predict_proba.return_value = proba pipeline.predict.return_value = ww.init_series(pd.Series(["malignant"] * 2)) y_true = pd.Series(["malignant", "benign"], index=custom_index) - answer = _add_custom_index(answer, index_best=custom_index[0], - index_worst=custom_index[1], output_format=output_format) + answer = _add_custom_index( + answer, + index_best=custom_index[0], + index_worst=custom_index[1], + output_format=output_format, + ) else: # Multiclass text 
output is formatted slightly different so need to account for that if output_format == "text": @@ -447,31 +897,56 @@ def _add_custom_index(answer, index_best, index_worst, output_format): cross_entropy_mock = MagicMock(__name__="cross_entropy") mock_default_metrics.__getitem__.return_value = cross_entropy_mock cross_entropy_mock.return_value = pd.Series([0.15, 0.34]) - proba = pd.DataFrame({"setosa": [0.8, 0.2], "versicolor": [0.1, 0.75], "virginica": [0.1, 0.05]}) + proba = pd.DataFrame( + {"setosa": [0.8, 0.2], "versicolor": [0.1, 0.75], "virginica": [0.1, 0.05]} + ) proba.ww.init() pipeline.predict_proba.return_value = proba - pipeline.predict.return_value = ww.init_series(pd.Series(["setosa", "versicolor"])) + pipeline.predict.return_value = ww.init_series( + pd.Series(["setosa", "versicolor"]) + ) y_true = pd.Series(["setosa", "versicolor"], index=custom_index) - answer = _add_custom_index(answer, index_best=custom_index[0], - index_worst=custom_index[1], output_format=output_format) - - report = explain_predictions(pipeline, input_features, y=y_true, indices_to_explain=[0, 1], output_format=output_format) + answer = _add_custom_index( + answer, + index_best=custom_index[0], + index_worst=custom_index[1], + output_format=output_format, + ) + + report = explain_predictions( + pipeline, + input_features, + y=y_true, + indices_to_explain=[0, 1], + output_format=output_format, + ) if output_format == "text": compare_two_tables(report.splitlines(), explain_predictions_answer.splitlines()) elif output_format == "dataframe": assert report.columns.tolist() == explain_predictions_answer.columns.tolist() - pd.testing.assert_frame_equal(report, explain_predictions_answer[report.columns]) + pd.testing.assert_frame_equal( + report, explain_predictions_answer[report.columns] + ) else: assert report == explain_predictions_answer - best_worst_report = explain_predictions_best_worst(pipeline, input_features, y_true=y_true, - num_to_explain=1, output_format=output_format) + best_worst_report = explain_predictions_best_worst( + pipeline, + input_features, + y_true=y_true, + num_to_explain=1, + output_format=output_format, + ) if output_format == "text": compare_two_tables(best_worst_report.splitlines(), answer.splitlines()) elif output_format == "dataframe": # Check dataframes equal without caring about column order - assert sorted(best_worst_report.columns.tolist()) == sorted(answer.columns.tolist()) - pd.testing.assert_frame_equal(best_worst_report, answer[best_worst_report.columns]) + assert sorted(best_worst_report.columns.tolist()) == sorted( + answer.columns.tolist() + ) + pd.testing.assert_frame_equal( + best_worst_report, answer[best_worst_report.columns] + ) else: assert best_worst_report == answer @@ -504,27 +979,53 @@ def _add_custom_index(answer, index_best, index_worst, output_format): regression_custom_metric_answer_dict = { "explanations": [ - {"rank": {"prefix": "best", "index": 1}, - "predicted_values": {"probabilities": None, "predicted_value": 1, "target_value": 2, - "error_name": "sum", "error_value": 3, - "index_id": 1}, - "explanations": ["explanation_dictionary_goes_here"]}, - {"rank": {"prefix": "worst", "index": 1}, - "predicted_values": {"probabilities": None, "predicted_value": 2, "target_value": 3, - "error_name": "sum", "error_value": 5, - "index_id": 0}, - "explanations": ["explanation_dictionary_goes_here"]} + { + "rank": {"prefix": "best", "index": 1}, + "predicted_values": { + "probabilities": None, + "predicted_value": 1, + "target_value": 2, + "error_name": "sum", + 
"error_value": 3, + "index_id": 1, + }, + "explanations": ["explanation_dictionary_goes_here"], + }, + { + "rank": {"prefix": "worst", "index": 1}, + "predicted_values": { + "probabilities": None, + "predicted_value": 2, + "target_value": 3, + "error_name": "sum", + "error_value": 5, + "index_id": 0, + }, + "explanations": ["explanation_dictionary_goes_here"], + }, ] } -@pytest.mark.parametrize("output_format,answer", - [("text", regression_custom_metric_answer), - ("dict", regression_custom_metric_answer_dict)]) -@patch("evalml.model_understanding.prediction_explanations._user_interface._make_single_prediction_shap_table") -def test_explain_predictions_best_worst_custom_metric(mock_make_table, output_format, answer): - - mock_make_table.return_value = "table goes here" if output_format == "text" else {"explanations": ["explanation_dictionary_goes_here"]} +@pytest.mark.parametrize( + "output_format,answer", + [ + ("text", regression_custom_metric_answer), + ("dict", regression_custom_metric_answer_dict), + ], +) +@patch( + "evalml.model_understanding.prediction_explanations._user_interface._make_single_prediction_shap_table" +) +def test_explain_predictions_best_worst_custom_metric( + mock_make_table, output_format, answer +): + + mock_make_table.return_value = ( + "table goes here" + if output_format == "text" + else {"explanations": ["explanation_dictionary_goes_here"]} + ) pipeline = MagicMock() pipeline.parameters = "Parameters go here" input_features = pd.DataFrame({"a": [5, 6]}) @@ -539,11 +1040,19 @@ def test_explain_predictions_best_worst_custom_metric(mock_make_table, output_fo def sum(y_true, y_pred): return y_pred + y_true - best_worst_report = explain_predictions_best_worst(pipeline, input_features, y_true=y_true, - num_to_explain=1, metric=sum, output_format=output_format) + best_worst_report = explain_predictions_best_worst( + pipeline, + input_features, + y_true=y_true, + num_to_explain=1, + metric=sum, + output_format=output_format, + ) if output_format == "text": - compare_two_tables(best_worst_report.splitlines(), regression_custom_metric_answer.splitlines()) + compare_two_tables( + best_worst_report.splitlines(), regression_custom_metric_answer.splitlines() + ) else: assert best_worst_report == answer @@ -551,72 +1060,120 @@ def sum(y_true, y_pred): def test_explain_predictions_time_series(ts_data): X, y = ts_data - ts_pipeline = TimeSeriesRegressionPipeline(component_graph=["Delayed Feature Transformer", "Random Forest Regressor"], - parameters={"pipeline": {"date_index": None, "gap": 1, "max_delay": 2}, - "Random Forest Regressor": {"n_jobs": 1}}) + ts_pipeline = TimeSeriesRegressionPipeline( + component_graph=["Delayed Feature Transformer", "Random Forest Regressor"], + parameters={ + "pipeline": {"date_index": None, "gap": 1, "max_delay": 2}, + "Random Forest Regressor": {"n_jobs": 1}, + }, + ) ts_pipeline.fit(X, y) - exp = explain_predictions(pipeline=ts_pipeline, input_features=X, y=y, - indices_to_explain=[5, 11], output_format="dict") + exp = explain_predictions( + pipeline=ts_pipeline, + input_features=X, + y=y, + indices_to_explain=[5, 11], + output_format="dict", + ) # Check that the computed features to be explained aren't NaN. 
for exp_idx in range(len(exp["explanations"])): - assert not np.isnan(np.array(exp["explanations"][exp_idx]["explanations"][0]["feature_values"])).any() + assert not np.isnan( + np.array(exp["explanations"][exp_idx]["explanations"][0]["feature_values"]) + ).any() with pytest.raises(ValueError, match="Requested index"): - explain_predictions(pipeline=ts_pipeline, input_features=X, y=y, - indices_to_explain=[1, 11], output_format="text") + explain_predictions( + pipeline=ts_pipeline, + input_features=X, + y=y, + indices_to_explain=[1, 11], + output_format="text", + ) @pytest.mark.parametrize("output_format", ["text", "dict", "dataframe"]) -@pytest.mark.parametrize("pipeline_class, estimator", [(TimeSeriesRegressionPipeline, "Random Forest Regressor"), - (TimeSeriesBinaryClassificationPipeline, "Logistic Regression Classifier")]) -def test_explain_predictions_best_worst_time_series(output_format, pipeline_class, estimator, ts_data): +@pytest.mark.parametrize( + "pipeline_class, estimator", + [ + (TimeSeriesRegressionPipeline, "Random Forest Regressor"), + (TimeSeriesBinaryClassificationPipeline, "Logistic Regression Classifier"), + ], +) +def test_explain_predictions_best_worst_time_series( + output_format, pipeline_class, estimator, ts_data +): X, y = ts_data if is_binary(pipeline_class.problem_type): y = y % 2 - ts_pipeline = pipeline_class(component_graph=["Delayed Feature Transformer", estimator], - parameters={"pipeline": {"date_index": None, "gap": 1, "max_delay": 2}}) + ts_pipeline = pipeline_class( + component_graph=["Delayed Feature Transformer", estimator], + parameters={"pipeline": {"date_index": None, "gap": 1, "max_delay": 2}}, + ) ts_pipeline.fit(X, y) - exp = explain_predictions_best_worst(pipeline=ts_pipeline, input_features=X, y_true=y, - output_format=output_format) + exp = explain_predictions_best_worst( + pipeline=ts_pipeline, input_features=X, y_true=y, output_format=output_format + ) if output_format == "dict": # Check that the computed features to be explained aren't NaN. 
for exp_idx in range(len(exp["explanations"])): - assert not np.isnan(np.array(exp["explanations"][exp_idx]["explanations"][0]["feature_values"])).any() + assert not np.isnan( + np.array( + exp["explanations"][exp_idx]["explanations"][0]["feature_values"] + ) + ).any() -@pytest.mark.parametrize("problem_type", [ProblemTypes.REGRESSION, ProblemTypes.BINARY, ProblemTypes.MULTICLASS]) -def test_json_serialization(problem_type, X_y_regression, linear_regression_pipeline_class, - X_y_binary, logistic_regression_binary_pipeline_class, - X_y_multi, logistic_regression_multiclass_pipeline_class): +@pytest.mark.parametrize( + "problem_type", + [ProblemTypes.REGRESSION, ProblemTypes.BINARY, ProblemTypes.MULTICLASS], +) +def test_json_serialization( + problem_type, + X_y_regression, + linear_regression_pipeline_class, + X_y_binary, + logistic_regression_binary_pipeline_class, + X_y_multi, + logistic_regression_multiclass_pipeline_class, +): if problem_type == problem_type.REGRESSION: X, y = X_y_regression y = pd.Series(y) - pipeline = linear_regression_pipeline_class(parameters={"Linear Regressor": {"n_jobs": 1}}) + pipeline = linear_regression_pipeline_class( + parameters={"Linear Regressor": {"n_jobs": 1}} + ) elif problem_type == problem_type.BINARY: X, y = X_y_binary y = pd.Series(y).astype("str") - pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) else: X, y = X_y_multi y = pd.Series(y).astype("str") - pipeline = logistic_regression_multiclass_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = logistic_regression_multiclass_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) pipeline.fit(X, y) - best_worst = explain_predictions_best_worst(pipeline, pd.DataFrame(X), y, - num_to_explain=1, output_format="dict") + best_worst = explain_predictions_best_worst( + pipeline, pd.DataFrame(X), y, num_to_explain=1, output_format="dict" + ) assert json.loads(json.dumps(best_worst)) == best_worst - report = explain_predictions(pipeline, pd.DataFrame(X), y=y, output_format="dict", indices_to_explain=[0]) + report = explain_predictions( + pipeline, pd.DataFrame(X), y=y, output_format="dict", indices_to_explain=[0] + ) assert json.loads(json.dumps(report)) == report @@ -630,97 +1187,224 @@ def transform_y_for_problem_type(problem_type, y): return y -EXPECTED_DATETIME_FEATURES = {'datetime_hour', 'datetime_year', 'datetime_month', 'datetime_day_of_week'} +EXPECTED_DATETIME_FEATURES = { + "datetime_hour", + "datetime_year", + "datetime_month", + "datetime_day_of_week", +} -EXPECTED_DATETIME_FEATURES_OHE = {'datetime_hour', 'datetime_year', 'datetime_month_3', - 'datetime_day_of_week_0', 'datetime_day_of_week_1', 'datetime_day_of_week_2', 'datetime_day_of_week_3', - 'datetime_day_of_week_4', 'datetime_day_of_week_5', 'datetime_day_of_week_6', - 'datetime_month_0', 'datetime_month_1', 'datetime_month_2', 'datetime_month_4', - 'datetime_month_5', 'datetime_month_6', 'datetime_month_7'} +EXPECTED_DATETIME_FEATURES_OHE = { + "datetime_hour", + "datetime_year", + "datetime_month_3", + "datetime_day_of_week_0", + "datetime_day_of_week_1", + "datetime_day_of_week_2", + "datetime_day_of_week_3", + "datetime_day_of_week_4", + "datetime_day_of_week_5", + "datetime_day_of_week_6", + "datetime_month_0", + "datetime_month_1", + "datetime_month_2", + "datetime_month_4", + 
"datetime_month_5", + "datetime_month_6", + "datetime_month_7", +} -EXPECTED_CURRENCY_FEATURES = {'currency_XDR', 'currency_HTG', 'currency_PAB', 'currency_CNY', 'currency_TZS', - 'currency_LAK', 'currency_NAD', 'currency_IMP', 'currency_QAR', 'currency_EGP'} +EXPECTED_CURRENCY_FEATURES = { + "currency_XDR", + "currency_HTG", + "currency_PAB", + "currency_CNY", + "currency_TZS", + "currency_LAK", + "currency_NAD", + "currency_IMP", + "currency_QAR", + "currency_EGP", +} -EXPECTED_PROVIDER_FEATURES_OHE = {'provider_JCB 16 digit', 'provider_Discover', 'provider_American Express', - 'provider_JCB 15 digit', 'provider_Maestro', 'provider_VISA 19 digit', - 'provider_VISA 13 digit', 'provider_Mastercard', 'provider_VISA 16 digit', - 'provider_Diners Club / Carte Blanche'} +EXPECTED_PROVIDER_FEATURES_OHE = { + "provider_JCB 16 digit", + "provider_Discover", + "provider_American Express", + "provider_JCB 15 digit", + "provider_Maestro", + "provider_VISA 19 digit", + "provider_VISA 13 digit", + "provider_Mastercard", + "provider_VISA 16 digit", + "provider_Diners Club / Carte Blanche", +} -EXPECTED_PROVIDER_FEATURES_TEXT = {'DIVERSITY_SCORE(provider)', 'LSA(provider)[0]', 'LSA(provider)[1]', - 'MEAN_CHARACTERS_PER_WORD(provider)', 'POLARITY_SCORE(provider)'} +EXPECTED_PROVIDER_FEATURES_TEXT = { + "DIVERSITY_SCORE(provider)", + "LSA(provider)[0]", + "LSA(provider)[1]", + "MEAN_CHARACTERS_PER_WORD(provider)", + "POLARITY_SCORE(provider)", +} -pipeline_test_cases = [(BinaryClassificationPipeline, "Random Forest Classifier"), - (RegressionPipeline, "Random Forest Regressor"), - (MulticlassClassificationPipeline, "Random Forest Classifier")] +pipeline_test_cases = [ + (BinaryClassificationPipeline, "Random Forest Classifier"), + (RegressionPipeline, "Random Forest Regressor"), + (MulticlassClassificationPipeline, "Random Forest Classifier"), +] @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) def test_categories_aggregated_linear_pipeline(pipeline_class, estimator, fraud_100): X, y = fraud_100 - pipeline = pipeline_class(component_graph=["Select Columns Transformer", "One Hot Encoder", "DateTime Featurization Component", estimator], - parameters={"Select Columns Transformer": {'columns': ['amount', 'provider', "currency"]}, - estimator: {"n_jobs": 1}}) + pipeline = pipeline_class( + component_graph=[ + "Select Columns Transformer", + "One Hot Encoder", + "DateTime Featurization Component", + estimator, + ], + parameters={ + "Select Columns Transformer": { + "columns": ["amount", "provider", "currency"] + }, + estimator: {"n_jobs": 1}, + }, + ) y = transform_y_for_problem_type(pipeline.problem_type, y) pipeline.fit(X, y) - report = explain_predictions(pipeline, X, y, indices_to_explain=[0], output_format="dict") - for explanation in report["explanations"][0]['explanations']: - assert set(explanation['feature_names']) == {"amount", "provider", "currency"} - assert set(explanation['feature_values']) == {"CUC", "Mastercard", 24900} - assert explanation['drill_down'].keys() == {"currency", "provider"} - assert set(explanation['drill_down']['currency']['feature_names']) == EXPECTED_CURRENCY_FEATURES - assert set(explanation['drill_down']['provider']['feature_names']) == EXPECTED_PROVIDER_FEATURES_OHE + report = explain_predictions( + pipeline, X, y, indices_to_explain=[0], output_format="dict" + ) + for explanation in report["explanations"][0]["explanations"]: + assert set(explanation["feature_names"]) == {"amount", "provider", "currency"} + assert set(explanation["feature_values"]) 
== {"CUC", "Mastercard", 24900} + assert explanation["drill_down"].keys() == {"currency", "provider"} + assert ( + set(explanation["drill_down"]["currency"]["feature_names"]) + == EXPECTED_CURRENCY_FEATURES + ) + assert ( + set(explanation["drill_down"]["provider"]["feature_names"]) + == EXPECTED_PROVIDER_FEATURES_OHE + ) @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) def test_categories_aggregated_text(pipeline_class, estimator, fraud_100): X, y = fraud_100 - X.ww.set_types(logical_types={'provider': 'NaturalLanguage'}) - component_graph = ["Select Columns Transformer", "One Hot Encoder", "Text Featurization Component", "DateTime Featurization Component", estimator] + X.ww.set_types(logical_types={"provider": "NaturalLanguage"}) + component_graph = [ + "Select Columns Transformer", + "One Hot Encoder", + "Text Featurization Component", + "DateTime Featurization Component", + estimator, + ] - pipeline = pipeline_class(component_graph, - parameters={"Select Columns Transformer": {'columns': ['amount', 'provider', "currency", 'datetime']}, - estimator: {"n_jobs": 1}}) + pipeline = pipeline_class( + component_graph, + parameters={ + "Select Columns Transformer": { + "columns": ["amount", "provider", "currency", "datetime"] + }, + estimator: {"n_jobs": 1}, + }, + ) y = transform_y_for_problem_type(pipeline.problem_type, y) pipeline.fit(X, y) - report = explain_predictions(pipeline, X, y, indices_to_explain=[0], top_k_features=4, output_format="dict") - for explanation in report["explanations"][0]['explanations']: - assert set(explanation['feature_names']) == {"amount", "provider", "currency", "datetime"} - assert set(explanation['feature_values']) == {"CUC", "Mastercard", 24900, pd.Timestamp('2019-01-01 00:12:26')} - assert explanation['drill_down'].keys() == {"currency", "provider", "datetime"} - assert set(explanation['drill_down']['currency']['feature_names']) == EXPECTED_CURRENCY_FEATURES - assert set(explanation['drill_down']['provider']['feature_names']) == EXPECTED_PROVIDER_FEATURES_TEXT - assert set(explanation['drill_down']['datetime']['feature_names']) == EXPECTED_DATETIME_FEATURES + report = explain_predictions( + pipeline, X, y, indices_to_explain=[0], top_k_features=4, output_format="dict" + ) + for explanation in report["explanations"][0]["explanations"]: + assert set(explanation["feature_names"]) == { + "amount", + "provider", + "currency", + "datetime", + } + assert set(explanation["feature_values"]) == { + "CUC", + "Mastercard", + 24900, + pd.Timestamp("2019-01-01 00:12:26"), + } + assert explanation["drill_down"].keys() == {"currency", "provider", "datetime"} + assert ( + set(explanation["drill_down"]["currency"]["feature_names"]) + == EXPECTED_CURRENCY_FEATURES + ) + assert ( + set(explanation["drill_down"]["provider"]["feature_names"]) + == EXPECTED_PROVIDER_FEATURES_TEXT + ) + assert ( + set(explanation["drill_down"]["datetime"]["feature_names"]) + == EXPECTED_DATETIME_FEATURES + ) @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100): X, y = fraud_100 - pipeline = pipeline_class(component_graph=["Select Columns Transformer", "DateTime Featurization Component", - "One Hot Encoder", estimator], - parameters={"Select Columns Transformer": {'columns': ['datetime', 'amount', 'provider', "currency"]}, - 'DateTime Featurization Component': {"encode_as_categories": True}, - estimator: {"n_jobs": 1}}) + pipeline = pipeline_class( + component_graph=[ + 
"Select Columns Transformer", + "DateTime Featurization Component", + "One Hot Encoder", + estimator, + ], + parameters={ + "Select Columns Transformer": { + "columns": ["datetime", "amount", "provider", "currency"] + }, + "DateTime Featurization Component": {"encode_as_categories": True}, + estimator: {"n_jobs": 1}, + }, + ) y = transform_y_for_problem_type(pipeline.problem_type, y) pipeline.fit(X, y) - report = explain_predictions(pipeline, X, y, indices_to_explain=[0], output_format="dict", top_k_features=7) + report = explain_predictions( + pipeline, X, y, indices_to_explain=[0], output_format="dict", top_k_features=7 + ) - for explanation in report["explanations"][0]['explanations']: - assert set(explanation['feature_names']) == {"amount", "provider", "currency", "datetime"} - assert set(explanation['feature_values']) == {pd.Timestamp('2019-01-01 00:12:26'), 'Mastercard', 'CUC', 24900} - assert explanation['drill_down'].keys() == {"currency", "provider", "datetime"} - assert set(explanation['drill_down']['datetime']['feature_names']) == EXPECTED_DATETIME_FEATURES_OHE - assert set(explanation['drill_down']['currency']['feature_names']) == EXPECTED_CURRENCY_FEATURES - assert set(explanation['drill_down']['provider']['feature_names']) == EXPECTED_PROVIDER_FEATURES_OHE + for explanation in report["explanations"][0]["explanations"]: + assert set(explanation["feature_names"]) == { + "amount", + "provider", + "currency", + "datetime", + } + assert set(explanation["feature_values"]) == { + pd.Timestamp("2019-01-01 00:12:26"), + "Mastercard", + "CUC", + 24900, + } + assert explanation["drill_down"].keys() == {"currency", "provider", "datetime"} + assert ( + set(explanation["drill_down"]["datetime"]["feature_names"]) + == EXPECTED_DATETIME_FEATURES_OHE + ) + assert ( + set(explanation["drill_down"]["currency"]["feature_names"]) + == EXPECTED_CURRENCY_FEATURES + ) + assert ( + set(explanation["drill_down"]["provider"]["feature_names"]) + == EXPECTED_PROVIDER_FEATURES_OHE + ) @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) @@ -728,88 +1412,171 @@ def test_categories_aggregated_pca_dag(pipeline_class, estimator, fraud_100): X, y = fraud_100 component_graph = { - 'SelectNumeric': ["Select Columns Transformer"], - 'SelectCategorical': ["Select Columns Transformer"], - 'SelectDate': ["Select Columns Transformer"], - 'OHE': ['One Hot Encoder', 'SelectCategorical'], - 'DT': ['DateTime Featurization Component', "SelectDate"], - 'PCA': ['PCA Transformer', 'SelectNumeric'], - 'Estimator': [estimator, 'PCA', 'DT', 'OHE'], + "SelectNumeric": ["Select Columns Transformer"], + "SelectCategorical": ["Select Columns Transformer"], + "SelectDate": ["Select Columns Transformer"], + "OHE": ["One Hot Encoder", "SelectCategorical"], + "DT": ["DateTime Featurization Component", "SelectDate"], + "PCA": ["PCA Transformer", "SelectNumeric"], + "Estimator": [estimator, "PCA", "DT", "OHE"], } - parameters = {'SelectNumeric': {'columns': ['card_id', 'store_id', 'amount', 'lat', 'lng']}, - 'SelectCategorical': {'columns': ['currency', 'provider']}, - 'SelectDate': {'columns': ['datetime']}, - 'PCA': {"n_components": 2}, - 'Estimator': {"n_jobs": 1}} - pipeline = pipeline_class(component_graph=component_graph, - parameters=parameters) + parameters = { + "SelectNumeric": {"columns": ["card_id", "store_id", "amount", "lat", "lng"]}, + "SelectCategorical": {"columns": ["currency", "provider"]}, + "SelectDate": {"columns": ["datetime"]}, + "PCA": {"n_components": 2}, + "Estimator": {"n_jobs": 1}, + } + 
pipeline = pipeline_class(component_graph=component_graph, parameters=parameters) y = transform_y_for_problem_type(pipeline.problem_type, y) pipeline.fit(X, y) - report = explain_predictions(pipeline, X, y, indices_to_explain=[0], output_format="dict", top_k_features=7) + report = explain_predictions( + pipeline, X, y, indices_to_explain=[0], output_format="dict", top_k_features=7 + ) for explanation in report["explanations"][0]["explanations"]: - assert set(explanation['feature_names']) == {"component_0", "component_1", "provider", "currency", "datetime"} - assert all([f in explanation['feature_values'] for f in [pd.Timestamp('2019-01-01 00:12:26'), 'Mastercard', 'CUC']]) - assert explanation['drill_down'].keys() == {"currency", "provider", "datetime"} - assert set(explanation['drill_down']['currency']['feature_names']) == EXPECTED_CURRENCY_FEATURES - assert set(explanation['drill_down']['provider']['feature_names']) == EXPECTED_PROVIDER_FEATURES_OHE - assert set(explanation['drill_down']['datetime']['feature_names']) == EXPECTED_DATETIME_FEATURES + assert set(explanation["feature_names"]) == { + "component_0", + "component_1", + "provider", + "currency", + "datetime", + } + assert all( + [ + f in explanation["feature_values"] + for f in [pd.Timestamp("2019-01-01 00:12:26"), "Mastercard", "CUC"] + ] + ) + assert explanation["drill_down"].keys() == {"currency", "provider", "datetime"} + assert ( + set(explanation["drill_down"]["currency"]["feature_names"]) + == EXPECTED_CURRENCY_FEATURES + ) + assert ( + set(explanation["drill_down"]["provider"]["feature_names"]) + == EXPECTED_PROVIDER_FEATURES_OHE + ) + assert ( + set(explanation["drill_down"]["datetime"]["feature_names"]) + == EXPECTED_DATETIME_FEATURES + ) @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) -def test_categories_aggregated_but_not_those_that_are_dropped(pipeline_class, estimator, fraud_100): +def test_categories_aggregated_but_not_those_that_are_dropped( + pipeline_class, estimator, fraud_100 +): X, y = fraud_100 - component_graph = ["Select Columns Transformer", "One Hot Encoder", - "DateTime Featurization Component", 'Drop Columns Transformer', estimator] - parameters = {"Select Columns Transformer": {'columns': ['amount', 'provider', "currency", - "datetime"]}, - "Drop Columns Transformer": {"columns": list(EXPECTED_DATETIME_FEATURES)}, - estimator: {"n_jobs": 1}} + component_graph = [ + "Select Columns Transformer", + "One Hot Encoder", + "DateTime Featurization Component", + "Drop Columns Transformer", + estimator, + ] + parameters = { + "Select Columns Transformer": { + "columns": ["amount", "provider", "currency", "datetime"] + }, + "Drop Columns Transformer": {"columns": list(EXPECTED_DATETIME_FEATURES)}, + estimator: {"n_jobs": 1}, + } pipeline = pipeline_class(component_graph=component_graph, parameters=parameters) y = transform_y_for_problem_type(pipeline.problem_type, y) pipeline.fit(X, y) - report = explain_predictions(pipeline, X, y, indices_to_explain=[0], output_format="dict") - for explanation in report["explanations"][0]['explanations']: - assert set(explanation['feature_names']) == {"amount", "provider", "currency"} - assert set(explanation['feature_values']) == {"CUC", "Mastercard", 24900} - assert explanation['drill_down'].keys() == {"currency", "provider"} - assert set(explanation['drill_down']['currency']['feature_names']) == EXPECTED_CURRENCY_FEATURES - assert set(explanation['drill_down']['provider']['feature_names']) == EXPECTED_PROVIDER_FEATURES_OHE + report = 
explain_predictions( + pipeline, X, y, indices_to_explain=[0], output_format="dict" + ) + for explanation in report["explanations"][0]["explanations"]: + assert set(explanation["feature_names"]) == {"amount", "provider", "currency"} + assert set(explanation["feature_values"]) == {"CUC", "Mastercard", 24900} + assert explanation["drill_down"].keys() == {"currency", "provider"} + assert ( + set(explanation["drill_down"]["currency"]["feature_names"]) + == EXPECTED_CURRENCY_FEATURES + ) + assert ( + set(explanation["drill_down"]["provider"]["feature_names"]) + == EXPECTED_PROVIDER_FEATURES_OHE + ) @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) -def test_categories_aggregated_when_some_are_dropped(pipeline_class, estimator, fraud_100): +def test_categories_aggregated_when_some_are_dropped( + pipeline_class, estimator, fraud_100 +): X, y = fraud_100 - component_graph = ["Select Columns Transformer", "One Hot Encoder", "DateTime Featurization Component", 'Drop Columns Transformer', estimator] - parameters = {"Select Columns Transformer": {'columns': ['amount', 'provider', "currency", - "datetime"]}, - "Drop Columns Transformer": {"columns": ["datetime_month", "datetime_hour"]}, - estimator: {"n_jobs": 1}} + component_graph = [ + "Select Columns Transformer", + "One Hot Encoder", + "DateTime Featurization Component", + "Drop Columns Transformer", + estimator, + ] + parameters = { + "Select Columns Transformer": { + "columns": ["amount", "provider", "currency", "datetime"] + }, + "Drop Columns Transformer": {"columns": ["datetime_month", "datetime_hour"]}, + estimator: {"n_jobs": 1}, + } pipeline = pipeline_class(component_graph=component_graph, parameters=parameters) y = transform_y_for_problem_type(pipeline.problem_type, y) pipeline.fit(X, y) - report = explain_predictions(pipeline, X, y, indices_to_explain=[0], output_format="dict", top_k_features=4) - for explanation in report["explanations"][0]['explanations']: - assert set(explanation['feature_names']) == {"amount", "provider", "currency", "datetime"} - assert set(explanation['feature_values']) == {"CUC", "Mastercard", 24900, pd.Timestamp('2019-01-01 00:12:26')} - assert explanation['drill_down'].keys() == {"currency", "provider", "datetime"} - assert set(explanation['drill_down']['currency']['feature_names']) == EXPECTED_CURRENCY_FEATURES - assert set(explanation['drill_down']['provider']['feature_names']) == EXPECTED_PROVIDER_FEATURES_OHE - assert set(explanation['drill_down']['datetime']['feature_names']) == {"datetime_year", "datetime_day_of_week"} - - -@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION]) -def test_explain_predictions_stacked_ensemble(problem_type, dummy_stacked_ensemble_binary_estimator, dummy_stacked_ensemble_multiclass_estimator, - dummy_stacked_ensemble_regressor_estimator, X_y_binary, X_y_multi, X_y_regression): + report = explain_predictions( + pipeline, X, y, indices_to_explain=[0], output_format="dict", top_k_features=4 + ) + for explanation in report["explanations"][0]["explanations"]: + assert set(explanation["feature_names"]) == { + "amount", + "provider", + "currency", + "datetime", + } + assert set(explanation["feature_values"]) == { + "CUC", + "Mastercard", + 24900, + pd.Timestamp("2019-01-01 00:12:26"), + } + assert explanation["drill_down"].keys() == {"currency", "provider", "datetime"} + assert ( + set(explanation["drill_down"]["currency"]["feature_names"]) + == EXPECTED_CURRENCY_FEATURES + ) + assert ( + 
set(explanation["drill_down"]["provider"]["feature_names"]) + == EXPECTED_PROVIDER_FEATURES_OHE + ) + assert set(explanation["drill_down"]["datetime"]["feature_names"]) == { + "datetime_year", + "datetime_day_of_week", + } + + +@pytest.mark.parametrize( + "problem_type", + [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION], +) +def test_explain_predictions_stacked_ensemble( + problem_type, + dummy_stacked_ensemble_binary_estimator, + dummy_stacked_ensemble_multiclass_estimator, + dummy_stacked_ensemble_regressor_estimator, + X_y_binary, + X_y_multi, + X_y_regression, +): if is_binary(problem_type): X, y = X_y_binary pipeline = dummy_stacked_ensemble_binary_estimator @@ -820,25 +1587,61 @@ def test_explain_predictions_stacked_ensemble(problem_type, dummy_stacked_ensemb X, y = X_y_regression pipeline = dummy_stacked_ensemble_regressor_estimator - with pytest.raises(ValueError, match="Cannot explain predictions for a stacked ensemble pipeline"): + with pytest.raises( + ValueError, match="Cannot explain predictions for a stacked ensemble pipeline" + ): explain_predictions(pipeline, X, y, indices_to_explain=[0]) - with pytest.raises(ValueError, match="Cannot explain predictions for a stacked ensemble pipeline"): + with pytest.raises( + ValueError, match="Cannot explain predictions for a stacked ensemble pipeline" + ): explain_predictions_best_worst(pipeline, X, y) -@pytest.mark.parametrize("estimator", [e for e in _all_estimators() if ('Classifier' in e.name and not any(s in e.name for s in ["Baseline", "Cat", "Elastic", "KN", "Ensemble"]))]) +@pytest.mark.parametrize( + "estimator", + [ + e + for e in _all_estimators() + if ( + "Classifier" in e.name + and not any( + s in e.name for s in ["Baseline", "Cat", "Elastic", "KN", "Ensemble"] + ) + ) + ], +) def test_explain_predictions_oversampler(estimator, fraud_100): - pytest.importorskip('imblearn.over_sampling', reason='Skipping test because imbalanced-learn not installed') + pytest.importorskip( + "imblearn.over_sampling", + reason="Skipping test because imbalanced-learn not installed", + ) X, y = fraud_100 - pipeline = BinaryClassificationPipeline(component_graph=["Imputer", "One Hot Encoder", "DateTime Featurization Component", "SMOTENC Oversampler", estimator]) + pipeline = BinaryClassificationPipeline( + component_graph=[ + "Imputer", + "One Hot Encoder", + "DateTime Featurization Component", + "SMOTENC Oversampler", + estimator, + ] + ) pipeline.fit(X, y) - report = explain_predictions(pipeline, X, y, indices_to_explain=[0], output_format="dataframe", top_k_features=4) - assert report['feature_names'].isnull().sum() == 0 - assert report['feature_values'].isnull().sum() == 0 - - -@patch("evalml.model_understanding.prediction_explanations._user_interface._make_single_prediction_shap_table") + report = explain_predictions( + pipeline, + X, + y, + indices_to_explain=[0], + output_format="dataframe", + top_k_features=4, + ) + assert report["feature_names"].isnull().sum() == 0 + assert report["feature_values"].isnull().sum() == 0 + + +@patch( + "evalml.model_understanding.prediction_explanations._user_interface._make_single_prediction_shap_table" +) def test_explain_predictions_best_worst_callback(mock_make_table): pipeline = MagicMock() pipeline.parameters = "Mock parameters" @@ -860,6 +1663,8 @@ def __call__(self, progress_stage, time_elapsed): self.total_elapsed_time = time_elapsed mock_callback = MockCallback() - explain_predictions_best_worst(pipeline, input_features, y_true, num_to_explain=1, callback=mock_callback) + 
explain_predictions_best_worst( + pipeline, input_features, y_true, num_to_explain=1, callback=mock_callback + ) assert mock_callback.progress_stages == [e for e in ExplainPredictionsStage] assert mock_callback.total_elapsed_time > 0 diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_user_interface.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_user_interface.py index 8f06618901..530d8861ae 100644 --- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_user_interface.py +++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_user_interface.py @@ -12,25 +12,49 @@ _make_rows, _make_text_table, _MultiClassSHAPTable, - _RegressionSHAPTable + _RegressionSHAPTable, ) -make_rows_test_cases = [({"a": [0.2], "b": [0.1]}, 3, [["a", "1.20", "++"], ["b", "1.10", "+"]]), - ({"a": [0.3], "b": [-0.9], "c": [0.5], - "d": [0.33], "e": [-0.67], "f": [-0.2], - "g": [0.71]}, 4, - [["g", "1.71", "++++"], ["c", "1.50", "+++"], - ["e", "0.33", "----"], ["b", "0.10", "-----"]]), - ({"a": [1.0], "f": [-1.0], "e": [0.0]}, 5, - [["a", "2.00", "+++++"], ["e", "1.00", "+"], ["f", "0.00", "-----"]])] - - -@pytest.mark.parametrize("test_case,include_shap_values,include_string_features", - product(make_rows_test_cases, [True, False], [True, False])) -def test_make_rows_and_make_table(test_case, include_shap_values, include_string_features): +make_rows_test_cases = [ + ({"a": [0.2], "b": [0.1]}, 3, [["a", "1.20", "++"], ["b", "1.10", "+"]]), + ( + { + "a": [0.3], + "b": [-0.9], + "c": [0.5], + "d": [0.33], + "e": [-0.67], + "f": [-0.2], + "g": [0.71], + }, + 4, + [ + ["g", "1.71", "++++"], + ["c", "1.50", "+++"], + ["e", "0.33", "----"], + ["b", "0.10", "-----"], + ], + ), + ( + {"a": [1.0], "f": [-1.0], "e": [0.0]}, + 5, + [["a", "2.00", "+++++"], ["e", "1.00", "+"], ["f", "0.00", "-----"]], + ), +] + + +@pytest.mark.parametrize( + "test_case,include_shap_values,include_string_features", + product(make_rows_test_cases, [True, False], [True, False]), +) +def test_make_rows_and_make_table( + test_case, include_shap_values, include_string_features +): values, top_k, answer = test_case - pipeline_features = pd.DataFrame({name: value[0] + 1 for name, value in values.items()}, index=[5]) + pipeline_features = pd.DataFrame( + {name: value[0] + 1 for name, value in values.items()}, index=[5] + ) if include_string_features: pipeline_features["a"] = ["foo-feature"] @@ -55,18 +79,36 @@ def test_make_rows_and_make_table(test_case, include_shap_values, include_string filtered_answer[-1][1] = val new_answer = filtered_answer - assert _make_rows(values, values, pipeline_features, pipeline_features, top_k, include_shap_values) == new_answer - - table = _make_text_table(values, values, pipeline_features, pipeline_features, top_k, include_shap_values).splitlines() + assert ( + _make_rows( + values, + values, + pipeline_features, + pipeline_features, + top_k, + include_shap_values, + ) + == new_answer + ) + + table = _make_text_table( + values, values, pipeline_features, pipeline_features, top_k, include_shap_values + ).splitlines() if include_shap_values: assert "SHAP Value" in table[0] # Subtracting two because a header and a line under the header are included in the table. 
assert len(table) - 2 == len(new_answer) -@pytest.mark.parametrize("value,answer", [(np.int64(3), 3), (np.float32(3.2), 3.2), - (np.str_("foo"), "foo"), - (np.bool_(True), True)]) +@pytest.mark.parametrize( + "value,answer", + [ + (np.int64(3), 3), + (np.float32(3.2), 3.2), + (np.str_("foo"), "foo"), + (np.bool_(True), True), + ], +) def test_make_json_serializable(value, answer): value = _make_json_serializable(value) if answer != "foo": @@ -76,18 +118,54 @@ def test_make_json_serializable(value, answer): json.dumps(value) -regression = {"a": [6.500], "b": [1.770], "c": [0.570], - "d": [-0.090], "e": [-0.290], "f": [-1.910], - "foo": [0.01], "bar": [-0.02]} +regression = { + "a": [6.500], + "b": [1.770], + "c": [0.570], + "d": [-0.090], + "e": [-0.290], + "f": [-1.910], + "foo": [0.01], + "bar": [-0.02], +} -regression_normalized = {'a': [0.6214], 'b': [0.1692], 'bar': [-0.0019], - 'c': [0.0544], 'd': [-0.0086], 'e': [-0.0277], - 'f': [-0.8], 'foo': [0.0001]} +regression_normalized = { + "a": [0.6214], + "b": [0.1692], + "bar": [-0.0019], + "c": [0.0544], + "d": [-0.0086], + "e": [-0.0277], + "f": [-0.8], + "foo": [0.0001], +} -regression_pipeline_features = pd.DataFrame({"a": 7.5, "b": 2.77, "c": 1.57, "d": 0.91, "e": 0.71, "f": -0.21, - "foo": -20, "bar": -30}, index=[31]) -regression_original_features = pd.DataFrame({"a": 0.75, "b": 0.277, "c": 0.57, "d": 1.91, "e": 1.71, "f": -1.21, - "foo": -20, "bar": -40}, index=[31]) +regression_pipeline_features = pd.DataFrame( + { + "a": 7.5, + "b": 2.77, + "c": 1.57, + "d": 0.91, + "e": 0.71, + "f": -0.21, + "foo": -20, + "bar": -30, + }, + index=[31], +) +regression_original_features = pd.DataFrame( + { + "a": 0.75, + "b": 0.277, + "c": 0.57, + "d": 1.91, + "e": 1.71, + "f": -1.21, + "foo": -20, + "bar": -40, + }, + index=[31], +) regression_table = """Feature Name Feature Value Contribution to Prediction ========================================================= @@ -102,40 +180,73 @@ def test_make_json_serializable(value, answer): f -0.21 ----- -1.91""".splitlines() regression_dict = { - "explanations": [{ - "feature_names": ["a", "b", "f"], - "feature_values": [7.5, 2.77, -0.21], - "qualitative_explanation": ["++++", "+", "-----"], - "quantitative_explanation": [None, None, None], - "drill_down": {}, - "class_name": None - }] + "explanations": [ + { + "feature_names": ["a", "b", "f"], + "feature_values": [7.5, 2.77, -0.21], + "qualitative_explanation": ["++++", "+", "-----"], + "quantitative_explanation": [None, None, None], + "drill_down": {}, + "class_name": None, + } + ] } regression_dict_shap = { - "explanations": [{ - "feature_names": ["a", "b", "f"], - "feature_values": [7.5, 2.77, -0.21], - "qualitative_explanation": ["++++", "+", "-----"], - "quantitative_explanation": [6.50, 1.77, -1.91], - "drill_down": {}, - "class_name": None - }] + "explanations": [ + { + "feature_names": ["a", "b", "f"], + "feature_values": [7.5, 2.77, -0.21], + "qualitative_explanation": ["++++", "+", "-----"], + "quantitative_explanation": [6.50, 1.77, -1.91], + "drill_down": {}, + "class_name": None, + } + ] } -binary = [{"a": [0], "b": [0], "c": [0], - "d": [0], "e": [0], "f": [0], "foo": [-1]}, - {"a": [1.180], "b": [0.0], "c": [1.120], - "d": [-0.560], "e": [-2.600], "f": [-0.900], "foo": [-1]}] - -binary_normalized = [{'a': [0.0], 'b': [0.0], 'c': [0.0], 'd': [0.0], 'e': [0.0], 'f': [0.0], 'foo': [-1.0]}, - {'a': [0.16], 'b': [0.0], 'c': [0.15], - 'd': [-0.08], 'e': [-0.35], 'f': [-0.12], 'foo': [-0.14]}] - -binary_pipeline_features = pd.DataFrame({"a": 
2.18, "b": 2.12, "c": 1.0, "d": -1.56, "e": -1.8, "f": -1.9, - "foo": -20}, index=[23]) -binary_original_features = pd.DataFrame({"a": 1.18, "b": 1.12, "c": 2.0, "d": -2.56, "e": -2.8, "f": -2.9, - "foo": -30}, index=[23]) +binary = [ + {"a": [0], "b": [0], "c": [0], "d": [0], "e": [0], "f": [0], "foo": [-1]}, + { + "a": [1.180], + "b": [0.0], + "c": [1.120], + "d": [-0.560], + "e": [-2.600], + "f": [-0.900], + "foo": [-1], + }, +] + +binary_normalized = [ + { + "a": [0.0], + "b": [0.0], + "c": [0.0], + "d": [0.0], + "e": [0.0], + "f": [0.0], + "foo": [-1.0], + }, + { + "a": [0.16], + "b": [0.0], + "c": [0.15], + "d": [-0.08], + "e": [-0.35], + "f": [-0.12], + "foo": [-0.14], + }, +] + +binary_pipeline_features = pd.DataFrame( + {"a": 2.18, "b": 2.12, "c": 1.0, "d": -1.56, "e": -1.8, "f": -1.9, "foo": -20}, + index=[23], +) +binary_original_features = pd.DataFrame( + {"a": 1.18, "b": 1.12, "c": 2.0, "d": -2.56, "e": -2.8, "f": -2.9, "foo": -30}, + index=[23], +) binary_table = """Feature Name Feature Value Contribution to Prediction ========================================================= @@ -150,41 +261,90 @@ def test_make_json_serializable(value, answer): e -1.80 -- -2.60""".splitlines() binary_dict = { - "explanations": [{ - "feature_names": ["a", "c", "e"], - "feature_values": [2.180, 1.0, -1.80], - "qualitative_explanation": ["+", "+", "--"], - "quantitative_explanation": [None, None, None], - "drill_down": {}, - "class_name": "1" - }] + "explanations": [ + { + "feature_names": ["a", "c", "e"], + "feature_values": [2.180, 1.0, -1.80], + "qualitative_explanation": ["+", "+", "--"], + "quantitative_explanation": [None, None, None], + "drill_down": {}, + "class_name": "1", + } + ] } binary_dict_shap = { - "explanations": [{ - "feature_names": ["a", "c", "e"], - "feature_values": [2.180, 1.0, -1.80], - "qualitative_explanation": ["+", "+", "--"], - "quantitative_explanation": [1.180, 1.120, -2.60], - "drill_down": {}, - "class_name": "1" - }] + "explanations": [ + { + "feature_names": ["a", "c", "e"], + "feature_values": [2.180, 1.0, -1.80], + "qualitative_explanation": ["+", "+", "--"], + "quantitative_explanation": [1.180, 1.120, -2.60], + "drill_down": {}, + "class_name": "1", + } + ] } -multiclass = [{"a": [0], "b": [0], "c": [0], - "d": [0.11], "e": [0.18], "f": [0], "foo": [-1]}, - {"a": [1.180], "b": [1.120], "c": [0.000], "d": [-2.560], - "e": [-2.800], "f": [-2.900], "foo": [-1]}, - {"a": [0.680], "b": [0.000], "c": [0.000], - "d": [-2.040], "e": [-1.840], "f": [-2.680], "foo": [-1]}] - -multiclass_normalized = [{'a': [0.0], 'b': [0.0], 'c': [0.0], 'd': [0.07], 'e': [0.08], 'f': [0.0], 'foo': [-1.0]}, - {'a': [0.102], 'b': [0.097], 'c': [0.0], 'd': [-0.221], 'e': [-0.242], 'f': [-0.251], 'foo': [-0.0865]}, - {'a': [0.08], 'b': [0.0], 'c': [0.0], 'd': [-0.25], 'e': [-0.22], 'f': [-0.33], 'foo': [-0.12]}] -multiclass_pipeline_features = pd.DataFrame({"a": 2.18, "b": 2.12, "c": 1.0, "d": -1.56, "e": -1.8, "f": -1.9, - "foo": 30}, index=[10]) -multiclass_original_features = pd.DataFrame({"a": 1.18, "b": 1.12, "c": 2.0, "d": -2.56, "e": -4.8, "f": -5.9, - "foo": 40}, index=[10]) +multiclass = [ + {"a": [0], "b": [0], "c": [0], "d": [0.11], "e": [0.18], "f": [0], "foo": [-1]}, + { + "a": [1.180], + "b": [1.120], + "c": [0.000], + "d": [-2.560], + "e": [-2.800], + "f": [-2.900], + "foo": [-1], + }, + { + "a": [0.680], + "b": [0.000], + "c": [0.000], + "d": [-2.040], + "e": [-1.840], + "f": [-2.680], + "foo": [-1], + }, +] + +multiclass_normalized = [ + { + "a": [0.0], + "b": [0.0], + 
"c": [0.0], + "d": [0.07], + "e": [0.08], + "f": [0.0], + "foo": [-1.0], + }, + { + "a": [0.102], + "b": [0.097], + "c": [0.0], + "d": [-0.221], + "e": [-0.242], + "f": [-0.251], + "foo": [-0.0865], + }, + { + "a": [0.08], + "b": [0.0], + "c": [0.0], + "d": [-0.25], + "e": [-0.22], + "f": [-0.33], + "foo": [-0.12], + }, +] +multiclass_pipeline_features = pd.DataFrame( + {"a": 2.18, "b": 2.12, "c": 1.0, "d": -1.56, "e": -1.8, "f": -1.9, "foo": 30}, + index=[10], +) +multiclass_original_features = pd.DataFrame( + {"a": 1.18, "b": 1.12, "c": 2.0, "d": -2.56, "e": -4.8, "f": -5.9, "foo": 40}, + index=[10], +) multiclass_table = """Class: 0 @@ -240,92 +400,226 @@ def test_make_json_serializable(value, answer): multiclass_dict = { "explanations": [ - {"feature_names": ["e", "d", "foo"], - "feature_values": [-1.8, -1.56, 30], - "qualitative_explanation": ["+", "+", "-----"], - "quantitative_explanation": [None, None, None], - "drill_down": {}, - "class_name": "0"}, - {"feature_names": ["d", "e", "f"], - "feature_values": [-1.56, -1.8, -1.9], - "qualitative_explanation": ["--", "--", "--"], - "quantitative_explanation": [None, None, None], - "drill_down": {}, - "class_name": "1"}, - {"feature_names": ["e", "d", "f"], - "feature_values": [-1.8, -1.56, -1.9], - "qualitative_explanation": ["--", "--", "--"], - "quantitative_explanation": [None, None, None], - "drill_down": {}, - "class_name": "2"} + { + "feature_names": ["e", "d", "foo"], + "feature_values": [-1.8, -1.56, 30], + "qualitative_explanation": ["+", "+", "-----"], + "quantitative_explanation": [None, None, None], + "drill_down": {}, + "class_name": "0", + }, + { + "feature_names": ["d", "e", "f"], + "feature_values": [-1.56, -1.8, -1.9], + "qualitative_explanation": ["--", "--", "--"], + "quantitative_explanation": [None, None, None], + "drill_down": {}, + "class_name": "1", + }, + { + "feature_names": ["e", "d", "f"], + "feature_values": [-1.8, -1.56, -1.9], + "qualitative_explanation": ["--", "--", "--"], + "quantitative_explanation": [None, None, None], + "drill_down": {}, + "class_name": "2", + }, ] } multiclass_dict_shap = { "explanations": [ - {"feature_names": ["e", "d", "foo"], - "feature_values": [-1.8, -1.56, 30], - "qualitative_explanation": ["+", "+", "-----"], - "quantitative_explanation": [0.18, 0.11, -1], - "drill_down": {}, - "class_name": "0"}, - {"feature_names": ["d", "e", "f"], - "feature_values": [-1.56, -1.8, -1.9], - "qualitative_explanation": ["--", "--", "--"], - "quantitative_explanation": [-2.56, -2.8, -2.9], - "drill_down": {}, - "class_name": "1"}, - {"feature_names": ["e", "d", "f"], - "feature_values": [-1.8, -1.56, -1.9], - "qualitative_explanation": ["--", "--", "--"], - "quantitative_explanation": [-1.84, -2.04, -2.68], - "drill_down": {}, - "class_name": "2"} + { + "feature_names": ["e", "d", "foo"], + "feature_values": [-1.8, -1.56, 30], + "qualitative_explanation": ["+", "+", "-----"], + "quantitative_explanation": [0.18, 0.11, -1], + "drill_down": {}, + "class_name": "0", + }, + { + "feature_names": ["d", "e", "f"], + "feature_values": [-1.56, -1.8, -1.9], + "qualitative_explanation": ["--", "--", "--"], + "quantitative_explanation": [-2.56, -2.8, -2.9], + "drill_down": {}, + "class_name": "1", + }, + { + "feature_names": ["e", "d", "f"], + "feature_values": [-1.8, -1.56, -1.9], + "qualitative_explanation": ["--", "--", "--"], + "quantitative_explanation": [-1.84, -2.04, -2.68], + "drill_down": {}, + "class_name": "2", + }, ] } 
-@pytest.mark.parametrize("values,normalized_values,pipeline_features,original_features,include_shap,output_format,answer", - [(regression, regression_normalized, regression_pipeline_features, regression_original_features, False, "text", regression_table), - (regression, regression_normalized, regression_pipeline_features, regression_original_features, True, "text", regression_table_shap), - (regression, regression_normalized, regression_pipeline_features, regression_original_features, False, "dict", regression_dict), - (regression, regression_normalized, regression_pipeline_features, regression_original_features, True, "dict", regression_dict_shap), - (binary, binary_normalized, binary_pipeline_features, binary_original_features, False, "text", binary_table), - (binary, binary_normalized, binary_pipeline_features, binary_original_features, True, "text", binary_table_shap), - (binary, binary_normalized, binary_pipeline_features, binary_original_features, False, "dict", binary_dict), - (binary, binary_normalized, binary_pipeline_features, binary_original_features, True, "dict", binary_dict_shap), - (multiclass, multiclass_normalized, multiclass_pipeline_features, multiclass_original_features, False, "text", multiclass_table), - (multiclass, multiclass_normalized, multiclass_pipeline_features, multiclass_original_features, True, "text", multiclass_table_shap), - (multiclass, multiclass_normalized, multiclass_pipeline_features, multiclass_original_features, False, "dict", multiclass_dict), - (multiclass, multiclass_normalized, multiclass_pipeline_features, multiclass_original_features, True, "dict", multiclass_dict_shap) - ]) -def test_make_single_prediction_table(values, normalized_values, pipeline_features, original_features, - include_shap, output_format, answer): +@pytest.mark.parametrize( + "values,normalized_values,pipeline_features,original_features,include_shap,output_format,answer", + [ + ( + regression, + regression_normalized, + regression_pipeline_features, + regression_original_features, + False, + "text", + regression_table, + ), + ( + regression, + regression_normalized, + regression_pipeline_features, + regression_original_features, + True, + "text", + regression_table_shap, + ), + ( + regression, + regression_normalized, + regression_pipeline_features, + regression_original_features, + False, + "dict", + regression_dict, + ), + ( + regression, + regression_normalized, + regression_pipeline_features, + regression_original_features, + True, + "dict", + regression_dict_shap, + ), + ( + binary, + binary_normalized, + binary_pipeline_features, + binary_original_features, + False, + "text", + binary_table, + ), + ( + binary, + binary_normalized, + binary_pipeline_features, + binary_original_features, + True, + "text", + binary_table_shap, + ), + ( + binary, + binary_normalized, + binary_pipeline_features, + binary_original_features, + False, + "dict", + binary_dict, + ), + ( + binary, + binary_normalized, + binary_pipeline_features, + binary_original_features, + True, + "dict", + binary_dict_shap, + ), + ( + multiclass, + multiclass_normalized, + multiclass_pipeline_features, + multiclass_original_features, + False, + "text", + multiclass_table, + ), + ( + multiclass, + multiclass_normalized, + multiclass_pipeline_features, + multiclass_original_features, + True, + "text", + multiclass_table_shap, + ), + ( + multiclass, + multiclass_normalized, + multiclass_pipeline_features, + multiclass_original_features, + False, + "dict", + multiclass_dict, + ), + ( + multiclass, + 
multiclass_normalized, + multiclass_pipeline_features, + multiclass_original_features, + True, + "dict", + multiclass_dict_shap, + ), + ], +) +def test_make_single_prediction_table( + values, + normalized_values, + pipeline_features, + original_features, + include_shap, + output_format, + answer, +): class_names = ["0", "1", "2"] if isinstance(values, list): if len(values) > 2: - table_maker = _MultiClassSHAPTable(top_k=3, include_shap_values=include_shap, - class_names=class_names, provenance={}) + table_maker = _MultiClassSHAPTable( + top_k=3, + include_shap_values=include_shap, + class_names=class_names, + provenance={}, + ) else: - table_maker = _BinarySHAPTable(class_names=class_names, top_k=3, include_shap_values=include_shap, - provenance={}) + table_maker = _BinarySHAPTable( + class_names=class_names, + top_k=3, + include_shap_values=include_shap, + provenance={}, + ) else: - table_maker = _RegressionSHAPTable(top_k=3, include_shap_values=include_shap, provenance={}) - - table_maker = table_maker.make_text if output_format == "text" else table_maker.make_dict - - table = table_maker(aggregated_shap_values=values, - aggregated_normalized_values=normalized_values, - shap_values=values, - normalized_values=normalized_values, - pipeline_features=pipeline_features, - original_features=pipeline_features) + table_maker = _RegressionSHAPTable( + top_k=3, include_shap_values=include_shap, provenance={} + ) + + table_maker = ( + table_maker.make_text if output_format == "text" else table_maker.make_dict + ) + + table = table_maker( + aggregated_shap_values=values, + aggregated_normalized_values=normalized_values, + shap_values=values, + normalized_values=normalized_values, + pipeline_features=pipeline_features, + original_features=pipeline_features, + ) # Making sure the content is the same, regardless of formatting. 
if output_format == "text": - for index, (row_table, row_answer) in enumerate(zip(table.splitlines(), answer)): + for index, (row_table, row_answer) in enumerate( + zip(table.splitlines(), answer) + ): assert row_table.strip().split() == row_answer.strip().split() else: assert table == answer diff --git a/evalml/tests/model_understanding_tests/test_graphs.py b/evalml/tests/model_understanding_tests/test_graphs.py index 4aa46c7673..c5a32b437c 100644 --- a/evalml/tests/model_understanding_tests/test_graphs.py +++ b/evalml/tests/model_understanding_tests/test_graphs.py @@ -33,7 +33,7 @@ precision_recall_curve, roc_curve, t_sne, - visualize_decision_tree + visualize_decision_tree, ) from evalml.objectives import CostBenefitMatrix from evalml.pipelines import ( @@ -42,7 +42,7 @@ ElasticNetRegressor, LinearRegressor, MulticlassClassificationPipeline, - RegressionPipeline + RegressionPipeline, ) from evalml.problem_types import ProblemTypes from evalml.utils import get_random_state, infer_feature_types @@ -51,7 +51,12 @@ @pytest.fixture def test_pipeline(): class TestPipeline(BinaryClassificationPipeline): - component_graph = ['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier'] + component_graph = [ + "Simple Imputer", + "One Hot Encoder", + "Standard Scaler", + "Logistic Regression Classifier", + ] def __init__(self, parameters): super().__init__(self.component_graph, parameters=parameters) @@ -59,7 +64,7 @@ def __init__(self, parameters): return TestPipeline(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) -@pytest.mark.parametrize("data_type", ['np', 'pd', 'ww']) +@pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) def test_confusion_matrix(data_type, make_data_type): y_true = np.array([2, 0, 2, 2, 0, 1, 1, 0, 2]) y_predicted = np.array([0, 0, 2, 2, 0, 2, 1, 1, 1]) @@ -71,26 +76,32 @@ def test_confusion_matrix(data_type, make_data_type): assert np.array_equal(conf_mat_expected, conf_mat.to_numpy()) assert isinstance(conf_mat, pd.DataFrame) - conf_mat = confusion_matrix(y_true, y_predicted, normalize_method='all') + conf_mat = confusion_matrix(y_true, y_predicted, normalize_method="all") conf_mat_expected = conf_mat_expected / 9.0 assert np.array_equal(conf_mat_expected, conf_mat.to_numpy()) assert isinstance(conf_mat, pd.DataFrame) - conf_mat = confusion_matrix(y_true, y_predicted, normalize_method='true') - conf_mat_expected = np.array([[2 / 3.0, 1 / 3.0, 0], [0, 0.5, 0.5], [0.25, 0.25, 0.5]]) + conf_mat = confusion_matrix(y_true, y_predicted, normalize_method="true") + conf_mat_expected = np.array( + [[2 / 3.0, 1 / 3.0, 0], [0, 0.5, 0.5], [0.25, 0.25, 0.5]] + ) assert np.array_equal(conf_mat_expected, conf_mat.to_numpy()) assert isinstance(conf_mat, pd.DataFrame) - conf_mat = confusion_matrix(y_true, y_predicted, normalize_method='pred') - conf_mat_expected = np.array([[2 / 3.0, 1 / 3.0, 0], [0, 1 / 3.0, 1 / 3.0], [1 / 3.0, 1 / 3.0, 2 / 3.0]]) + conf_mat = confusion_matrix(y_true, y_predicted, normalize_method="pred") + conf_mat_expected = np.array( + [[2 / 3.0, 1 / 3.0, 0], [0, 1 / 3.0, 1 / 3.0], [1 / 3.0, 1 / 3.0, 2 / 3.0]] + ) assert np.allclose(conf_mat_expected, conf_mat.to_numpy(), equal_nan=True) assert isinstance(conf_mat, pd.DataFrame) - with pytest.raises(ValueError, match='Invalid value provided'): - conf_mat = confusion_matrix(y_true, y_predicted, normalize_method='Invalid Option') + with pytest.raises(ValueError, match="Invalid value provided"): + conf_mat = confusion_matrix( + y_true, y_predicted, 
normalize_method="Invalid Option" + ) -@pytest.mark.parametrize("data_type", ['ww', 'np', 'pd']) +@pytest.mark.parametrize("data_type", ["ww", "np", "pd"]) def test_normalize_confusion_matrix(data_type, make_data_type): conf_mat = np.array([[2, 3, 0], [0, 1, 1], [1, 0, 2]]) conf_mat = make_data_type(data_type, conf_mat) @@ -99,11 +110,11 @@ def test_normalize_confusion_matrix(data_type, make_data_type): assert all(conf_mat_normalized.sum(axis=1) == 1.0) assert isinstance(conf_mat_normalized, pd.DataFrame) - conf_mat_normalized = normalize_confusion_matrix(conf_mat, 'pred') + conf_mat_normalized = normalize_confusion_matrix(conf_mat, "pred") for col_sum in conf_mat_normalized.sum(axis=0): assert col_sum == 1.0 or col_sum == 0.0 - conf_mat_normalized = normalize_confusion_matrix(conf_mat, 'all') + conf_mat_normalized = normalize_confusion_matrix(conf_mat, "all") assert conf_mat_normalized.sum().sum() == 1.0 # testing with named pd.DataFrames @@ -113,37 +124,40 @@ def test_normalize_confusion_matrix(data_type, make_data_type): conf_mat_df["col_3"] = [2, 0, 0] conf_mat_normalized = normalize_confusion_matrix(conf_mat_df) assert all(conf_mat_normalized.sum(axis=1) == 1.0) - assert list(conf_mat_normalized.columns) == ['col_1', 'col_2', 'col_3'] + assert list(conf_mat_normalized.columns) == ["col_1", "col_2", "col_3"] - conf_mat_normalized = normalize_confusion_matrix(conf_mat_df, 'pred') + conf_mat_normalized = normalize_confusion_matrix(conf_mat_df, "pred") for col_sum in conf_mat_normalized.sum(axis=0): assert col_sum == 1.0 or col_sum == 0.0 - conf_mat_normalized = normalize_confusion_matrix(conf_mat_df, 'all') + conf_mat_normalized = normalize_confusion_matrix(conf_mat_df, "all") assert conf_mat_normalized.sum().sum() == 1.0 -@pytest.mark.parametrize("data_type", ['ww', 'np', 'pd']) +@pytest.mark.parametrize("data_type", ["ww", "np", "pd"]) def test_normalize_confusion_matrix_error(data_type, make_data_type): conf_mat = np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]]) conf_mat = make_data_type(data_type, conf_mat) - warnings.simplefilter('default', category=RuntimeWarning) + warnings.simplefilter("default", category=RuntimeWarning) - with pytest.raises(ValueError, match='Invalid value provided for "normalize_method": invalid option'): - normalize_confusion_matrix(conf_mat, normalize_method='invalid option') - with pytest.raises(ValueError, match='Invalid value provided'): + with pytest.raises( + ValueError, + match='Invalid value provided for "normalize_method": invalid option', + ): + normalize_confusion_matrix(conf_mat, normalize_method="invalid option") + with pytest.raises(ValueError, match="Invalid value provided"): normalize_confusion_matrix(conf_mat, normalize_method=None) with pytest.raises(ValueError, match="Sum of given axis is 0"): - normalize_confusion_matrix(conf_mat, 'true') + normalize_confusion_matrix(conf_mat, "true") with pytest.raises(ValueError, match="Sum of given axis is 0"): - normalize_confusion_matrix(conf_mat, 'pred') + normalize_confusion_matrix(conf_mat, "pred") with pytest.raises(ValueError, match="Sum of given axis is 0"): - normalize_confusion_matrix(conf_mat, 'all') + normalize_confusion_matrix(conf_mat, "all") -@pytest.mark.parametrize("data_type", ['ww', 'pd', 'np']) +@pytest.mark.parametrize("data_type", ["ww", "pd", "np"]) def test_confusion_matrix_labels(data_type, make_data_type): y_true = np.array([True, False, True, True, False, False]) y_pred = np.array([False, False, True, True, False, False]) @@ -164,21 +178,21 @@ def 
test_confusion_matrix_labels(data_type, make_data_type): assert np.array_equal(conf_mat.index, labels) assert np.array_equal(conf_mat.columns, labels) - y_true = np.array(['blue', 'red', 'blue', 'red']) - y_pred = np.array(['blue', 'red', 'red', 'red']) + y_true = np.array(["blue", "red", "blue", "red"]) + y_pred = np.array(["blue", "red", "red", "red"]) y_true = make_data_type(data_type, y_true) y_pred = make_data_type(data_type, y_pred) conf_mat = confusion_matrix(y_true=y_true, y_predicted=y_pred) - labels = ['blue', 'red'] + labels = ["blue", "red"] assert np.array_equal(conf_mat.index, labels) assert np.array_equal(conf_mat.columns, labels) - y_true = np.array(['blue', 'red', 'red', 'red', 'orange', 'orange']) - y_pred = np.array(['red', 'blue', 'blue', 'red', 'orange', 'orange']) + y_true = np.array(["blue", "red", "red", "red", "orange", "orange"]) + y_pred = np.array(["red", "blue", "blue", "red", "orange", "orange"]) y_true = make_data_type(data_type, y_true) y_pred = make_data_type(data_type, y_pred) conf_mat = confusion_matrix(y_true=y_true, y_predicted=y_pred) - labels = ['blue', 'orange', 'red'] + labels = ["blue", "orange", "red"] assert np.array_equal(conf_mat.index, labels) assert np.array_equal(conf_mat.columns, labels) @@ -205,30 +219,27 @@ def test_precision_recall_curve_return_type(): y_true = np.array([0, 0, 1, 1]) y_predict_proba = np.array([0.1, 0.4, 0.35, 0.8]) precision_recall_curve_data = precision_recall_curve(y_true, y_predict_proba) - assert isinstance(precision_recall_curve_data['precision'], np.ndarray) - assert isinstance(precision_recall_curve_data['recall'], np.ndarray) - assert isinstance(precision_recall_curve_data['thresholds'], np.ndarray) - assert isinstance(precision_recall_curve_data['auc_score'], float) + assert isinstance(precision_recall_curve_data["precision"], np.ndarray) + assert isinstance(precision_recall_curve_data["recall"], np.ndarray) + assert isinstance(precision_recall_curve_data["thresholds"], np.ndarray) + assert isinstance(precision_recall_curve_data["auc_score"], float) -@pytest.mark.parametrize("data_type", ['np', 'pd', 'pd2d', 'li', 'ww']) +@pytest.mark.parametrize("data_type", ["np", "pd", "pd2d", "li", "ww"]) def test_precision_recall_curve(data_type, make_data_type): y_true = np.array([0, 0, 1, 1]) y_predict_proba = np.array([0.1, 0.4, 0.35, 0.8]) - if data_type == 'pd2d': - data_type = 'pd' - y_predict_proba = np.array([[0.9, 0.1], - [0.6, 0.4], - [0.65, 0.35], - [0.2, 0.8]]) + if data_type == "pd2d": + data_type = "pd" + y_predict_proba = np.array([[0.9, 0.1], [0.6, 0.4], [0.65, 0.35], [0.2, 0.8]]) y_true = make_data_type(data_type, y_true) y_predict_proba = make_data_type(data_type, y_predict_proba) precision_recall_curve_data = precision_recall_curve(y_true, y_predict_proba) - precision = precision_recall_curve_data.get('precision') - recall = precision_recall_curve_data.get('recall') - thresholds = precision_recall_curve_data.get('thresholds') + precision = precision_recall_curve_data.get("precision") + recall = precision_recall_curve_data.get("recall") + thresholds = precision_recall_curve_data.get("thresholds") precision_expected = np.array([0.66666667, 0.5, 1, 1]) recall_expected = np.array([1, 0.5, 0.5, 0]) @@ -241,15 +252,16 @@ def test_precision_recall_curve(data_type, make_data_type): def test_precision_recall_curve_pos_label_idx(): y_true = pd.Series(np.array([0, 0, 1, 1])) - y_predict_proba = pd.DataFrame(np.array([[0.9, 0.1], - [0.6, 0.4], - [0.65, 0.35], - [0.2, 0.8]])) - precision_recall_curve_data = 
precision_recall_curve(y_true, y_predict_proba, pos_label_idx=1) + y_predict_proba = pd.DataFrame( + np.array([[0.9, 0.1], [0.6, 0.4], [0.65, 0.35], [0.2, 0.8]]) + ) + precision_recall_curve_data = precision_recall_curve( + y_true, y_predict_proba, pos_label_idx=1 + ) - precision = precision_recall_curve_data.get('precision') - recall = precision_recall_curve_data.get('recall') - thresholds = precision_recall_curve_data.get('thresholds') + precision = precision_recall_curve_data.get("precision") + recall = precision_recall_curve_data.get("recall") + thresholds = precision_recall_curve_data.get("thresholds") precision_expected = np.array([0.66666667, 0.5, 1, 1]) recall_expected = np.array([1, 0.5, 0.5, 0]) @@ -258,11 +270,12 @@ def test_precision_recall_curve_pos_label_idx(): np.testing.assert_almost_equal(recall_expected, recall, decimal=5) np.testing.assert_almost_equal(thresholds_expected, thresholds, decimal=5) - y_predict_proba = pd.DataFrame(np.array([[0.1, 0.9], - [0.4, 0.6], - [0.35, 0.65], - [0.8, 0.2]])) - precision_recall_curve_data = precision_recall_curve(y_true, y_predict_proba, pos_label_idx=0) + y_predict_proba = pd.DataFrame( + np.array([[0.1, 0.9], [0.4, 0.6], [0.35, 0.65], [0.8, 0.2]]) + ) + precision_recall_curve_data = precision_recall_curve( + y_true, y_predict_proba, pos_label_idx=0 + ) np.testing.assert_almost_equal(precision_expected, precision, decimal=5) np.testing.assert_almost_equal(recall_expected, recall, decimal=5) np.testing.assert_almost_equal(thresholds_expected, thresholds, decimal=5) @@ -270,18 +283,20 @@ def test_precision_recall_curve_pos_label_idx(): def test_precision_recall_curve_pos_label_idx_error(make_data_type): y_true = np.array([0, 0, 1, 1]) - y_predict_proba = np.array([[0.9, 0.1], - [0.6, 0.4], - [0.65, 0.35], - [0.2, 0.8]]) - with pytest.raises(NoPositiveLabelException, - match="Predicted probabilities of shape \\(4, 2\\) don't contain a column at index 9001"): + y_predict_proba = np.array([[0.9, 0.1], [0.6, 0.4], [0.65, 0.35], [0.2, 0.8]]) + with pytest.raises( + NoPositiveLabelException, + match="Predicted probabilities of shape \\(4, 2\\) don't contain a column at index 9001", + ): precision_recall_curve(y_true, y_predict_proba, pos_label_idx=9001) -@pytest.mark.parametrize("data_type", ['np', 'pd', 'ww']) +@pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) def test_graph_precision_recall_curve(X_y_binary, data_type, make_data_type): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y_true = X_y_binary rs = get_random_state(42) y_pred_proba = y_true * rs.random(y_true.shape) @@ -291,27 +306,40 @@ def test_graph_precision_recall_curve(X_y_binary, data_type, make_data_type): assert isinstance(fig, type(go.Figure())) fig_dict = fig.to_dict() - assert fig_dict['layout']['title']['text'] == 'Precision-Recall' - assert len(fig_dict['data']) == 1 + assert fig_dict["layout"]["title"]["text"] == "Precision-Recall" + assert len(fig_dict["data"]) == 1 precision_recall_curve_data = precision_recall_curve(y_true, y_pred_proba) - assert np.array_equal(fig_dict['data'][0]['x'], precision_recall_curve_data['recall']) - assert np.array_equal(fig_dict['data'][0]['y'], precision_recall_curve_data['precision']) - assert fig_dict['data'][0]['name'] == 'Precision-Recall (AUC {:06f})'.format(precision_recall_curve_data['auc_score']) + assert np.array_equal( + 
fig_dict["data"][0]["x"], precision_recall_curve_data["recall"] + ) + assert np.array_equal( + fig_dict["data"][0]["y"], precision_recall_curve_data["precision"] + ) + assert fig_dict["data"][0]["name"] == "Precision-Recall (AUC {:06f})".format( + precision_recall_curve_data["auc_score"] + ) def test_graph_precision_recall_curve_title_addition(X_y_binary): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y_true = X_y_binary rs = get_random_state(42) y_pred_proba = y_true * rs.random(y_true.shape) - fig = graph_precision_recall_curve(y_true, y_pred_proba, title_addition='with added title text') + fig = graph_precision_recall_curve( + y_true, y_pred_proba, title_addition="with added title text" + ) assert isinstance(fig, type(go.Figure())) fig_dict = fig.to_dict() - assert fig_dict['layout']['title']['text'] == 'Precision-Recall with added title text' + assert ( + fig_dict["layout"]["title"]["text"] == "Precision-Recall with added title text" + ) -@pytest.mark.parametrize("data_type", ['np', 'pd', 'ww']) +@pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) def test_roc_curve_binary(data_type, make_data_type): y_true = np.array([1, 1, 0, 0]) y_predict_proba = np.array([0.1, 0.4, 0.35, 0.8]) @@ -319,10 +347,10 @@ def test_roc_curve_binary(data_type, make_data_type): y_predict_proba = make_data_type(data_type, y_predict_proba) roc_curve_data = roc_curve(y_true, y_predict_proba)[0] - fpr_rates = roc_curve_data.get('fpr_rates') - tpr_rates = roc_curve_data.get('tpr_rates') - thresholds = roc_curve_data.get('thresholds') - auc_score = roc_curve_data.get('auc_score') + fpr_rates = roc_curve_data.get("fpr_rates") + tpr_rates = roc_curve_data.get("tpr_rates") + thresholds = roc_curve_data.get("thresholds") + auc_score = roc_curve_data.get("auc_score") fpr_expected = np.array([0, 0.5, 0.5, 1, 1]) tpr_expected = np.array([0, 0, 0.5, 0.5, 1]) thresholds_expected = np.array([1.8, 0.8, 0.4, 0.35, 0.1]) @@ -330,9 +358,9 @@ def test_roc_curve_binary(data_type, make_data_type): assert np.array_equal(tpr_expected, tpr_rates) assert np.array_equal(thresholds_expected, thresholds) assert auc_score == pytest.approx(0.25, 1e-5) - assert isinstance(roc_curve_data['fpr_rates'], np.ndarray) - assert isinstance(roc_curve_data['tpr_rates'], np.ndarray) - assert isinstance(roc_curve_data['thresholds'], np.ndarray) + assert isinstance(roc_curve_data["fpr_rates"], np.ndarray) + assert isinstance(roc_curve_data["tpr_rates"], np.ndarray) + assert isinstance(roc_curve_data["thresholds"], np.ndarray) y_true = np.array([1, 1, 0, 0]) y_predict_proba = np.array([[0.9, 0.1], [0.6, 0.4], [0.65, 0.35], [0.2, 0.8]]) @@ -340,10 +368,10 @@ def test_roc_curve_binary(data_type, make_data_type): y_true = make_data_type(data_type, y_true) roc_curve_data = roc_curve(y_true, y_predict_proba)[0] - fpr_rates = roc_curve_data.get('fpr_rates') - tpr_rates = roc_curve_data.get('tpr_rates') - thresholds = roc_curve_data.get('thresholds') - auc_score = roc_curve_data.get('auc_score') + fpr_rates = roc_curve_data.get("fpr_rates") + tpr_rates = roc_curve_data.get("tpr_rates") + thresholds = roc_curve_data.get("thresholds") + auc_score = roc_curve_data.get("auc_score") fpr_expected = np.array([0, 0.5, 0.5, 1, 1]) tpr_expected = np.array([0, 0, 0.5, 0.5, 1]) thresholds_expected = np.array([1.8, 0.8, 0.4, 0.35, 0.1]) @@ -351,56 +379,63 @@ def 
test_roc_curve_binary(data_type, make_data_type): assert np.array_equal(tpr_expected, tpr_rates) assert np.array_equal(thresholds_expected, thresholds) assert auc_score == pytest.approx(0.25, 1e-5) - assert isinstance(roc_curve_data['fpr_rates'], np.ndarray) - assert isinstance(roc_curve_data['tpr_rates'], np.ndarray) - assert isinstance(roc_curve_data['thresholds'], np.ndarray) + assert isinstance(roc_curve_data["fpr_rates"], np.ndarray) + assert isinstance(roc_curve_data["tpr_rates"], np.ndarray) + assert isinstance(roc_curve_data["thresholds"], np.ndarray) -@pytest.mark.parametrize("data_type", ['np', 'pd', 'ww']) +@pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) def test_roc_curve_multiclass(data_type, make_data_type): y_true = np.array([1, 2, 0, 0, 2, 1]) - y_predict_proba = np.array([[0.33, 0.33, 0.33], - [0.05, 0.05, 0.90], - [0.75, 0.15, 0.10], - [0.8, 0.1, 0.1], - [0.1, 0.1, 0.8], - [0.3, 0.4, 0.3]]) + y_predict_proba = np.array( + [ + [0.33, 0.33, 0.33], + [0.05, 0.05, 0.90], + [0.75, 0.15, 0.10], + [0.8, 0.1, 0.1], + [0.1, 0.1, 0.8], + [0.3, 0.4, 0.3], + ] + ) y_true = make_data_type(data_type, y_true) y_predict_proba = make_data_type(data_type, y_predict_proba) roc_curve_data = roc_curve(y_true, y_predict_proba) - fpr_expected = np.array([[0, 0, 0, 1], - [0, 0, 0, 0.25, 0.75, 1], - [0, 0, 0, 0.5, 1]]) - tpr_expected = np.array([[0, 0.5, 1, 1], - [0, 0.5, 1, 1, 1, 1], - [0, 0.5, 1, 1, 1]]) - thresholds_expected = np.array([[1.8, 0.8, 0.75, 0.05], - [1.4, 0.4, 0.33, 0.15, 0.1, 0.05], - [1.9, 0.9, 0.8, 0.3, 0.1]]) + fpr_expected = np.array([[0, 0, 0, 1], [0, 0, 0, 0.25, 0.75, 1], [0, 0, 0, 0.5, 1]]) + tpr_expected = np.array([[0, 0.5, 1, 1], [0, 0.5, 1, 1, 1, 1], [0, 0.5, 1, 1, 1]]) + thresholds_expected = np.array( + [ + [1.8, 0.8, 0.75, 0.05], + [1.4, 0.4, 0.33, 0.15, 0.1, 0.05], + [1.9, 0.9, 0.8, 0.3, 0.1], + ] + ) auc_expected = [1, 1, 1] y_true_unique = y_true - if data_type == 'ww': + if data_type == "ww": y_true_unique = y_true for i in np.unique(y_true_unique): - fpr_rates = roc_curve_data[i].get('fpr_rates') - tpr_rates = roc_curve_data[i].get('tpr_rates') - thresholds = roc_curve_data[i].get('thresholds') - auc_score = roc_curve_data[i].get('auc_score') + fpr_rates = roc_curve_data[i].get("fpr_rates") + tpr_rates = roc_curve_data[i].get("tpr_rates") + thresholds = roc_curve_data[i].get("thresholds") + auc_score = roc_curve_data[i].get("auc_score") assert np.array_equal(fpr_expected[i], fpr_rates) assert np.array_equal(tpr_expected[i], tpr_rates) assert np.array_equal(thresholds_expected[i], thresholds) assert auc_expected[i] == pytest.approx(auc_score, 1e-5) - assert isinstance(roc_curve_data[i]['fpr_rates'], np.ndarray) - assert isinstance(roc_curve_data[i]['tpr_rates'], np.ndarray) - assert isinstance(roc_curve_data[i]['thresholds'], np.ndarray) + assert isinstance(roc_curve_data[i]["fpr_rates"], np.ndarray) + assert isinstance(roc_curve_data[i]["tpr_rates"], np.ndarray) + assert isinstance(roc_curve_data[i]["thresholds"], np.ndarray) -@pytest.mark.parametrize("data_type", ['np', 'pd', 'ww']) +@pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) def test_graph_roc_curve_binary(X_y_binary, data_type, make_data_type): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y_true = X_y_binary rs = get_random_state(42) y_pred_proba = y_true * rs.random(y_true.shape) @@ -410,86 
+445,119 @@ def test_graph_roc_curve_binary(X_y_binary, data_type, make_data_type): fig = graph_roc_curve(y_true, y_pred_proba) assert isinstance(fig, type(go.Figure())) fig_dict = fig.to_dict() - assert fig_dict['layout']['title']['text'] == 'Receiver Operating Characteristic' - assert len(fig_dict['data']) == 2 + assert fig_dict["layout"]["title"]["text"] == "Receiver Operating Characteristic" + assert len(fig_dict["data"]) == 2 roc_curve_data = roc_curve(y_true, y_pred_proba)[0] - assert np.array_equal(fig_dict['data'][0]['x'], roc_curve_data['fpr_rates']) - assert np.array_equal(fig_dict['data'][0]['y'], roc_curve_data['tpr_rates']) - assert np.array_equal(fig_dict['data'][0]['text'], roc_curve_data['thresholds']) - assert fig_dict['data'][0]['name'] == 'Class 1 (AUC {:06f})'.format(roc_curve_data['auc_score']) - assert np.array_equal(fig_dict['data'][1]['x'], np.array([0, 1])) - assert np.array_equal(fig_dict['data'][1]['y'], np.array([0, 1])) - assert fig_dict['data'][1]['name'] == 'Trivial Model (AUC 0.5)' + assert np.array_equal(fig_dict["data"][0]["x"], roc_curve_data["fpr_rates"]) + assert np.array_equal(fig_dict["data"][0]["y"], roc_curve_data["tpr_rates"]) + assert np.array_equal(fig_dict["data"][0]["text"], roc_curve_data["thresholds"]) + assert fig_dict["data"][0]["name"] == "Class 1 (AUC {:06f})".format( + roc_curve_data["auc_score"] + ) + assert np.array_equal(fig_dict["data"][1]["x"], np.array([0, 1])) + assert np.array_equal(fig_dict["data"][1]["y"], np.array([0, 1])) + assert fig_dict["data"][1]["name"] == "Trivial Model (AUC 0.5)" def test_graph_roc_curve_nans(): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) one_val_y_zero = np.array([0]) with pytest.warns(UndefinedMetricWarning): fig = graph_roc_curve(one_val_y_zero, one_val_y_zero) assert isinstance(fig, type(go.Figure())) fig_dict = fig.to_dict() - assert np.array_equal(fig_dict['data'][0]['x'], np.array([0., 1.])) - assert np.allclose(fig_dict['data'][0]['y'], np.array([np.nan, np.nan]), equal_nan=True) - fig1 = graph_roc_curve(np.array([np.nan, 1, 1, 0, 1]), np.array([0, 0, 0.5, 0.1, 0.9])) - fig2 = graph_roc_curve(np.array([1, 0, 1, 0, 1]), np.array([0, np.nan, 0.5, 0.1, 0.9])) + assert np.array_equal(fig_dict["data"][0]["x"], np.array([0.0, 1.0])) + assert np.allclose( + fig_dict["data"][0]["y"], np.array([np.nan, np.nan]), equal_nan=True + ) + fig1 = graph_roc_curve( + np.array([np.nan, 1, 1, 0, 1]), np.array([0, 0, 0.5, 0.1, 0.9]) + ) + fig2 = graph_roc_curve( + np.array([1, 0, 1, 0, 1]), np.array([0, np.nan, 0.5, 0.1, 0.9]) + ) assert fig1 == fig2 def test_graph_roc_curve_multiclass(binarized_ys): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) y_true, y_tr, y_pred_proba = binarized_ys fig = graph_roc_curve(y_true, y_pred_proba) assert isinstance(fig, type(go.Figure())) fig_dict = fig.to_dict() - assert fig_dict['layout']['title']['text'] == 'Receiver Operating Characteristic' - assert len(fig_dict['data']) == 4 + assert fig_dict["layout"]["title"]["text"] == "Receiver Operating Characteristic" + assert len(fig_dict["data"]) == 4 for i in range(3): roc_curve_data = roc_curve(y_tr[:, i], y_pred_proba[:, i])[0] - assert 
np.array_equal(fig_dict['data'][i]['x'], roc_curve_data['fpr_rates']) - assert np.array_equal(fig_dict['data'][i]['y'], roc_curve_data['tpr_rates']) - assert np.array_equal(fig_dict['data'][i]['text'], roc_curve_data['thresholds']) - assert fig_dict['data'][i]['name'] == 'Class {name} (AUC {:06f})'.format(roc_curve_data['auc_score'], name=i + 1) - assert np.array_equal(fig_dict['data'][3]['x'], np.array([0, 1])) - assert np.array_equal(fig_dict['data'][3]['y'], np.array([0, 1])) - assert fig_dict['data'][3]['name'] == 'Trivial Model (AUC 0.5)' - - with pytest.raises(ValueError, match='Number of custom class names does not match number of classes'): - graph_roc_curve(y_true, y_pred_proba, custom_class_names=['one', 'two']) + assert np.array_equal(fig_dict["data"][i]["x"], roc_curve_data["fpr_rates"]) + assert np.array_equal(fig_dict["data"][i]["y"], roc_curve_data["tpr_rates"]) + assert np.array_equal(fig_dict["data"][i]["text"], roc_curve_data["thresholds"]) + assert fig_dict["data"][i]["name"] == "Class {name} (AUC {:06f})".format( + roc_curve_data["auc_score"], name=i + 1 + ) + assert np.array_equal(fig_dict["data"][3]["x"], np.array([0, 1])) + assert np.array_equal(fig_dict["data"][3]["y"], np.array([0, 1])) + assert fig_dict["data"][3]["name"] == "Trivial Model (AUC 0.5)" + + with pytest.raises( + ValueError, + match="Number of custom class names does not match number of classes", + ): + graph_roc_curve(y_true, y_pred_proba, custom_class_names=["one", "two"]) def test_graph_roc_curve_multiclass_custom_class_names(binarized_ys): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) y_true, y_tr, y_pred_proba = binarized_ys - custom_class_names = ['one', 'two', 'three'] + custom_class_names = ["one", "two", "three"] fig = graph_roc_curve(y_true, y_pred_proba, custom_class_names=custom_class_names) assert isinstance(fig, type(go.Figure())) fig_dict = fig.to_dict() - assert fig_dict['layout']['title']['text'] == 'Receiver Operating Characteristic' + assert fig_dict["layout"]["title"]["text"] == "Receiver Operating Characteristic" for i in range(3): roc_curve_data = roc_curve(y_tr[:, i], y_pred_proba[:, i])[0] - assert np.array_equal(fig_dict['data'][i]['x'], roc_curve_data['fpr_rates']) - assert np.array_equal(fig_dict['data'][i]['y'], roc_curve_data['tpr_rates']) - assert fig_dict['data'][i]['name'] == 'Class {name} (AUC {:06f})'.format(roc_curve_data['auc_score'], name=custom_class_names[i]) - assert np.array_equal(fig_dict['data'][3]['x'], np.array([0, 1])) - assert np.array_equal(fig_dict['data'][3]['y'], np.array([0, 1])) - assert fig_dict['data'][3]['name'] == 'Trivial Model (AUC 0.5)' + assert np.array_equal(fig_dict["data"][i]["x"], roc_curve_data["fpr_rates"]) + assert np.array_equal(fig_dict["data"][i]["y"], roc_curve_data["tpr_rates"]) + assert fig_dict["data"][i]["name"] == "Class {name} (AUC {:06f})".format( + roc_curve_data["auc_score"], name=custom_class_names[i] + ) + assert np.array_equal(fig_dict["data"][3]["x"], np.array([0, 1])) + assert np.array_equal(fig_dict["data"][3]["y"], np.array([0, 1])) + assert fig_dict["data"][3]["name"] == "Trivial Model (AUC 0.5)" def test_graph_roc_curve_title_addition(X_y_binary): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + 
reason="Skipping plotting test because plotly not installed", + ) X, y_true = X_y_binary rs = get_random_state(42) y_pred_proba = y_true * rs.random(y_true.shape) - fig = graph_roc_curve(y_true, y_pred_proba, title_addition='with added title text') + fig = graph_roc_curve(y_true, y_pred_proba, title_addition="with added title text") assert isinstance(fig, type(go.Figure())) fig_dict = fig.to_dict() - assert fig_dict['layout']['title']['text'] == 'Receiver Operating Characteristic with added title text' + assert ( + fig_dict["layout"]["title"]["text"] + == "Receiver Operating Characteristic with added title text" + ) -@pytest.mark.parametrize("data_type", ['np', 'pd', 'ww']) +@pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) def test_graph_confusion_matrix_default(X_y_binary, data_type, make_data_type): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y_true = X_y_binary rs = get_random_state(42) y_pred = np.round(y_true * rs.random(y_true.shape)).astype(int) @@ -499,126 +567,187 @@ def test_graph_confusion_matrix_default(X_y_binary, data_type, make_data_type): fig = graph_confusion_matrix(y_true, y_pred) assert isinstance(fig, type(go.Figure())) fig_dict = fig.to_dict() - assert fig_dict['layout']['title']['text'] == 'Confusion matrix, normalized using method "true"' - assert fig_dict['layout']['xaxis']['title']['text'] == 'Predicted Label' - assert np.all(fig_dict['layout']['xaxis']['tickvals'] == np.array([0, 1])) - assert fig_dict['layout']['yaxis']['title']['text'] == 'True Label' - assert np.all(fig_dict['layout']['yaxis']['tickvals'] == np.array([0, 1])) - assert fig_dict['layout']['yaxis']['autorange'] == 'reversed' - heatmap = fig_dict['data'][0] - conf_mat = confusion_matrix(y_true, y_pred, normalize_method='true') + assert ( + fig_dict["layout"]["title"]["text"] + == 'Confusion matrix, normalized using method "true"' + ) + assert fig_dict["layout"]["xaxis"]["title"]["text"] == "Predicted Label" + assert np.all(fig_dict["layout"]["xaxis"]["tickvals"] == np.array([0, 1])) + assert fig_dict["layout"]["yaxis"]["title"]["text"] == "True Label" + assert np.all(fig_dict["layout"]["yaxis"]["tickvals"] == np.array([0, 1])) + assert fig_dict["layout"]["yaxis"]["autorange"] == "reversed" + heatmap = fig_dict["data"][0] + conf_mat = confusion_matrix(y_true, y_pred, normalize_method="true") conf_mat_unnormalized = confusion_matrix(y_true, y_pred, normalize_method=None) - assert np.array_equal(heatmap['x'], conf_mat.columns) - assert np.array_equal(heatmap['y'], conf_mat.columns) - assert np.array_equal(heatmap['z'], conf_mat) - assert np.array_equal(heatmap['customdata'], conf_mat_unnormalized) - assert heatmap['hovertemplate'] == 'True: %{y}
Predicted: %{x}
Normalized Count: %{z}
Raw Count: %{customdata}
' - annotations = fig.__dict__['_layout_obj']['annotations'] + assert np.array_equal(heatmap["x"], conf_mat.columns) + assert np.array_equal(heatmap["y"], conf_mat.columns) + assert np.array_equal(heatmap["z"], conf_mat) + assert np.array_equal(heatmap["customdata"], conf_mat_unnormalized) + assert ( + heatmap["hovertemplate"] + == "True: %{y}
Predicted: %{x}
Normalized Count: %{z}
Raw Count: %{customdata}
" + ) + annotations = fig.__dict__["_layout_obj"]["annotations"] # check that the figure has text annotations for the confusion matrix for i in range(len(annotations)): - assert 'text' in annotations[i] + assert "text" in annotations[i] def test_graph_confusion_matrix_norm_disabled(X_y_binary): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y_true = X_y_binary rs = get_random_state(42) y_pred = np.round(y_true * rs.random(y_true.shape)).astype(int) fig = graph_confusion_matrix(y_true, y_pred, normalize_method=None) assert isinstance(fig, type(go.Figure())) fig_dict = fig.to_dict() - assert fig_dict['layout']['title']['text'] == 'Confusion matrix' - assert fig_dict['layout']['xaxis']['title']['text'] == 'Predicted Label' - assert np.all(fig_dict['layout']['xaxis']['tickvals'] == np.array([0, 1])) - assert fig_dict['layout']['yaxis']['title']['text'] == 'True Label' - assert np.all(fig_dict['layout']['yaxis']['tickvals'] == np.array([0, 1])) - assert fig_dict['layout']['yaxis']['autorange'] == 'reversed' - heatmap = fig_dict['data'][0] + assert fig_dict["layout"]["title"]["text"] == "Confusion matrix" + assert fig_dict["layout"]["xaxis"]["title"]["text"] == "Predicted Label" + assert np.all(fig_dict["layout"]["xaxis"]["tickvals"] == np.array([0, 1])) + assert fig_dict["layout"]["yaxis"]["title"]["text"] == "True Label" + assert np.all(fig_dict["layout"]["yaxis"]["tickvals"] == np.array([0, 1])) + assert fig_dict["layout"]["yaxis"]["autorange"] == "reversed" + heatmap = fig_dict["data"][0] conf_mat = confusion_matrix(y_true, y_pred, normalize_method=None) - conf_mat_normalized = confusion_matrix(y_true, y_pred, normalize_method='true') - assert np.array_equal(heatmap['x'], conf_mat.columns) - assert np.array_equal(heatmap['y'], conf_mat.columns) - assert np.array_equal(heatmap['z'], conf_mat) - assert np.array_equal(heatmap['customdata'], conf_mat_normalized) - assert heatmap['hovertemplate'] == 'True: %{y}
Predicted: %{x}
Raw Count: %{z}
Normalized Count: %{customdata}
' + conf_mat_normalized = confusion_matrix(y_true, y_pred, normalize_method="true") + assert np.array_equal(heatmap["x"], conf_mat.columns) + assert np.array_equal(heatmap["y"], conf_mat.columns) + assert np.array_equal(heatmap["z"], conf_mat) + assert np.array_equal(heatmap["customdata"], conf_mat_normalized) + assert ( + heatmap["hovertemplate"] + == "True: %{y}
Predicted: %{x}
Raw Count: %{z}
Normalized Count: %{customdata}
" + ) def test_graph_confusion_matrix_title_addition(X_y_binary): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y_true = X_y_binary rs = get_random_state(42) y_pred = np.round(y_true * rs.random(y_true.shape)).astype(int) - fig = graph_confusion_matrix(y_true, y_pred, title_addition='with added title text') + fig = graph_confusion_matrix(y_true, y_pred, title_addition="with added title text") assert isinstance(fig, type(go.Figure())) fig_dict = fig.to_dict() - assert fig_dict['layout']['title']['text'] == 'Confusion matrix with added title text, normalized using method "true"' + assert ( + fig_dict["layout"]["title"]["text"] + == 'Confusion matrix with added title text, normalized using method "true"' + ) def test_graph_permutation_importance(X_y_binary, test_pipeline): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y = X_y_binary clf = test_pipeline clf.fit(X, y) fig = graph_permutation_importance(test_pipeline, X, y, "Log Loss Binary") assert isinstance(fig, go.Figure) fig_dict = fig.to_dict() - assert fig_dict['layout']['title']['text'] == "Permutation Importance
"\ - "The relative importance of each input feature's overall "\ - "influence on the pipelines' predictions, computed using the "\ - "permutation importance algorithm." - assert len(fig_dict['data']) == 1 - - perm_importance_data = calculate_permutation_importance(clf, X, y, "Log Loss Binary") - assert np.array_equal(fig_dict['data'][0]['x'][::-1], perm_importance_data['importance'].values) - assert np.array_equal(fig_dict['data'][0]['y'][::-1], perm_importance_data['feature']) - - -@patch('evalml.model_understanding.graphs.calculate_permutation_importance') + assert ( + fig_dict["layout"]["title"]["text"] == "Permutation Importance
" + "The relative importance of each input feature's overall " + "influence on the pipelines' predictions, computed using the " + "permutation importance algorithm." + ) + assert len(fig_dict["data"]) == 1 + + perm_importance_data = calculate_permutation_importance( + clf, X, y, "Log Loss Binary" + ) + assert np.array_equal( + fig_dict["data"][0]["x"][::-1], perm_importance_data["importance"].values + ) + assert np.array_equal( + fig_dict["data"][0]["y"][::-1], perm_importance_data["feature"] + ) + + +@patch("evalml.model_understanding.graphs.calculate_permutation_importance") def test_graph_permutation_importance_show_all_features(mock_perm_importance): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') - mock_perm_importance.return_value = pd.DataFrame({"feature": ["f1", "f2"], "importance": [0.0, 0.6]}) - - figure = graph_permutation_importance(test_pipeline, pd.DataFrame(), pd.Series(), "Log Loss Binary") + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) + mock_perm_importance.return_value = pd.DataFrame( + {"feature": ["f1", "f2"], "importance": [0.0, 0.6]} + ) + + figure = graph_permutation_importance( + test_pipeline, pd.DataFrame(), pd.Series(), "Log Loss Binary" + ) assert isinstance(figure, go.Figure) data = figure.data[0] - assert (np.any(data['x'] == 0.0)) + assert np.any(data["x"] == 0.0) -@patch('evalml.model_understanding.graphs.calculate_permutation_importance') +@patch("evalml.model_understanding.graphs.calculate_permutation_importance") def test_graph_permutation_importance_threshold(mock_perm_importance): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') - mock_perm_importance.return_value = pd.DataFrame({"feature": ["f1", "f2"], "importance": [0.0, 0.6]}) - - with pytest.raises(ValueError, match="Provided importance threshold of -0.1 must be greater than or equal to 0"): - fig = graph_permutation_importance(test_pipeline, pd.DataFrame(), pd.Series(), "Log Loss Binary", importance_threshold=-0.1) - fig = graph_permutation_importance(test_pipeline, pd.DataFrame(), pd.Series(), "Log Loss Binary", importance_threshold=0.5) + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) + mock_perm_importance.return_value = pd.DataFrame( + {"feature": ["f1", "f2"], "importance": [0.0, 0.6]} + ) + + with pytest.raises( + ValueError, + match="Provided importance threshold of -0.1 must be greater than or equal to 0", + ): + fig = graph_permutation_importance( + test_pipeline, + pd.DataFrame(), + pd.Series(), + "Log Loss Binary", + importance_threshold=-0.1, + ) + fig = graph_permutation_importance( + test_pipeline, + pd.DataFrame(), + pd.Series(), + "Log Loss Binary", + importance_threshold=0.5, + ) assert isinstance(fig, go.Figure) data = fig.data[0] - assert (np.all(data['x'] >= 0.5)) + assert np.all(data["x"] >= 0.5) @pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) -def test_cost_benefit_matrix_vs_threshold(data_type, X_y_binary, logistic_regression_binary_pipeline_class, make_data_type): +def test_cost_benefit_matrix_vs_threshold( + data_type, X_y_binary, logistic_regression_binary_pipeline_class, make_data_type +): X, y = X_y_binary X = make_data_type(data_type, X) y = make_data_type(data_type, y) - cbm = CostBenefitMatrix(true_positive=1, true_negative=-1, - false_positive=-7, false_negative=-2) + cbm = 
CostBenefitMatrix( + true_positive=1, true_negative=-1, false_positive=-7, false_negative=-2 + ) pipeline = logistic_regression_binary_pipeline_class(parameters={}) pipeline.fit(X, y) original_pipeline_threshold = pipeline.threshold cost_benefit_df = binary_objective_vs_threshold(pipeline, X, y, cbm) - assert list(cost_benefit_df.columns) == ['threshold', 'score'] + assert list(cost_benefit_df.columns) == ["threshold", "score"] assert cost_benefit_df.shape == (101, 2) assert not cost_benefit_df.isnull().all().all() assert pipeline.threshold == original_pipeline_threshold @pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) -def test_binary_objective_vs_threshold(data_type, X_y_binary, logistic_regression_binary_pipeline_class, make_data_type): +def test_binary_objective_vs_threshold( + data_type, X_y_binary, logistic_regression_binary_pipeline_class, make_data_type +): X, y = X_y_binary X = make_data_type(data_type, X) y = make_data_type(data_type, y) @@ -628,66 +757,88 @@ def test_binary_objective_vs_threshold(data_type, X_y_binary, logistic_regressio # test objective with score_needs_proba == True with pytest.raises(ValueError, match="Objective `score_needs_proba` must be False"): - binary_objective_vs_threshold(pipeline, X, y, 'Log Loss Binary') + binary_objective_vs_threshold(pipeline, X, y, "Log Loss Binary") # test with non-binary objective - with pytest.raises(ValueError, match="can only be calculated for binary classification objectives"): - binary_objective_vs_threshold(pipeline, X, y, 'f1 micro') + with pytest.raises( + ValueError, match="can only be calculated for binary classification objectives" + ): + binary_objective_vs_threshold(pipeline, X, y, "f1 micro") # test objective with score_needs_proba == False - results_df = binary_objective_vs_threshold(pipeline, X, y, 'f1') - assert list(results_df.columns) == ['threshold', 'score'] + results_df = binary_objective_vs_threshold(pipeline, X, y, "f1") + assert list(results_df.columns) == ["threshold", "score"] assert results_df.shape == (101, 2) assert not results_df.isnull().all().all() -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -def test_binary_objective_vs_threshold_steps(mock_score, - X_y_binary, logistic_regression_binary_pipeline_class): +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +def test_binary_objective_vs_threshold_steps( + mock_score, X_y_binary, logistic_regression_binary_pipeline_class +): X, y = X_y_binary - cbm = CostBenefitMatrix(true_positive=1, true_negative=-1, - false_positive=-7, false_negative=-2) + cbm = CostBenefitMatrix( + true_positive=1, true_negative=-1, false_positive=-7, false_negative=-2 + ) pipeline = logistic_regression_binary_pipeline_class(parameters={}) pipeline.fit(X, y) mock_score.return_value = {"Cost Benefit Matrix": 0.2} cost_benefit_df = binary_objective_vs_threshold(pipeline, X, y, cbm, steps=234) mock_score.assert_called() - assert list(cost_benefit_df.columns) == ['threshold', 'score'] + assert list(cost_benefit_df.columns) == ["threshold", "score"] assert cost_benefit_df.shape == (235, 2) -@pytest.mark.parametrize("data_type", ['np', 'pd', 'ww']) -@patch('evalml.model_understanding.graphs.binary_objective_vs_threshold') -def test_graph_binary_objective_vs_threshold(mock_cb_thresholds, data_type, X_y_binary, logistic_regression_binary_pipeline_class, make_data_type): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') +@pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) 
+@patch("evalml.model_understanding.graphs.binary_objective_vs_threshold") +def test_graph_binary_objective_vs_threshold( + mock_cb_thresholds, + data_type, + X_y_binary, + logistic_regression_binary_pipeline_class, + make_data_type, +): + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y = X_y_binary X = make_data_type(data_type, X) y = make_data_type(data_type, y) pipeline = logistic_regression_binary_pipeline_class(parameters={}) - cbm = CostBenefitMatrix(true_positive=1, true_negative=-1, - false_positive=-7, false_negative=-2) + cbm = CostBenefitMatrix( + true_positive=1, true_negative=-1, false_positive=-7, false_negative=-2 + ) - mock_cb_thresholds.return_value = pd.DataFrame({'threshold': [0, 0.5, 1.0], - 'score': [100, -20, 5]}) + mock_cb_thresholds.return_value = pd.DataFrame( + {"threshold": [0, 0.5, 1.0], "score": [100, -20, 5]} + ) figure = graph_binary_objective_vs_threshold(pipeline, X, y, cbm) assert isinstance(figure, go.Figure) data = figure.data[0] - assert not np.any(np.isnan(data['x'])) - assert not np.any(np.isnan(data['y'])) - assert np.array_equal(data['x'], mock_cb_thresholds.return_value['threshold']) - assert np.array_equal(data['y'], mock_cb_thresholds.return_value['score']) - - -@patch('evalml.model_understanding.graphs.jupyter_check') -@patch('evalml.model_understanding.graphs.import_or_raise') -def test_jupyter_graph_check(import_check, jupyter_check, X_y_binary, X_y_regression, test_pipeline): - pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + assert not np.any(np.isnan(data["x"])) + assert not np.any(np.isnan(data["y"])) + assert np.array_equal(data["x"], mock_cb_thresholds.return_value["threshold"]) + assert np.array_equal(data["y"], mock_cb_thresholds.return_value["score"]) + + +@patch("evalml.model_understanding.graphs.jupyter_check") +@patch("evalml.model_understanding.graphs.import_or_raise") +def test_jupyter_graph_check( + import_check, jupyter_check, X_y_binary, X_y_regression, test_pipeline +): + pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y = X_y_binary clf = test_pipeline clf.fit(X, y) - cbm = CostBenefitMatrix(true_positive=1, true_negative=-1, false_positive=-7, false_negative=-2) + cbm = CostBenefitMatrix( + true_positive=1, true_negative=-1, false_positive=-7, false_negative=-2 + ) jupyter_check.return_value = False with pytest.warns(None) as graph_valid: graph_permutation_importance(test_pipeline, X, y, "log loss binary") @@ -700,31 +851,31 @@ def test_jupyter_graph_check(import_check, jupyter_check, X_y_binary, X_y_regres with pytest.warns(None) as graph_valid: graph_partial_dependence(clf, X, features=0, grid_resolution=20) assert len(graph_valid) == 1 - import_check.assert_called_with('ipywidgets', warning=True) + import_check.assert_called_with("ipywidgets", warning=True) with pytest.warns(None) as graph_valid: graph_binary_objective_vs_threshold(test_pipeline, X, y, cbm) assert len(graph_valid) == 0 - import_check.assert_called_with('ipywidgets', warning=True) + import_check.assert_called_with("ipywidgets", warning=True) with pytest.warns(None) as graph_valid: rs = get_random_state(42) y_pred_proba = y * rs.random(y.shape) graph_precision_recall_curve(y, y_pred_proba) assert len(graph_valid) == 0 - import_check.assert_called_with('ipywidgets', warning=True) + import_check.assert_called_with("ipywidgets", warning=True) with 
pytest.warns(None) as graph_valid: graph_permutation_importance(test_pipeline, X, y, "log loss binary") assert len(graph_valid) == 0 - import_check.assert_called_with('ipywidgets', warning=True) + import_check.assert_called_with("ipywidgets", warning=True) with pytest.warns(None) as graph_valid: graph_confusion_matrix(y, y) assert len(graph_valid) == 0 - import_check.assert_called_with('ipywidgets', warning=True) + import_check.assert_called_with("ipywidgets", warning=True) with pytest.warns(None) as graph_valid: rs = get_random_state(42) y_pred_proba = y * rs.random(y.shape) graph_roc_curve(y, y_pred_proba) assert len(graph_valid) == 0 - import_check.assert_called_with('ipywidgets', warning=True) + import_check.assert_called_with("ipywidgets", warning=True) Xr, yr = X_y_regression with pytest.warns(None) as graph_valid: @@ -732,7 +883,7 @@ def test_jupyter_graph_check(import_check, jupyter_check, X_y_binary, X_y_regres y_preds = yr * rs.random(yr.shape) graph_prediction_vs_actual(yr, y_preds) assert len(graph_valid) == 0 - import_check.assert_called_with('ipywidgets', warning=True) + import_check.assert_called_with("ipywidgets", warning=True) @pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) @@ -747,45 +898,55 @@ def test_get_prediction_vs_actual_data(data_type, make_data_type): get_prediction_vs_actual_data(y_true_in, y_pred_in, outlier_threshold=-1) outlier_loc = [2, 11] - results = get_prediction_vs_actual_data(y_true_in, y_pred_in, outlier_threshold=2000) + results = get_prediction_vs_actual_data( + y_true_in, y_pred_in, outlier_threshold=2000 + ) assert isinstance(results, pd.DataFrame) - assert np.array_equal(results['prediction'], y_pred) - assert np.array_equal(results['actual'], y_true) - for i, value in enumerate(results['outlier']): + assert np.array_equal(results["prediction"], y_pred) + assert np.array_equal(results["actual"], y_true) + for i, value in enumerate(results["outlier"]): if i in outlier_loc: assert value == "#ffff00" else: - assert value == '#0000ff' + assert value == "#0000ff" results = get_prediction_vs_actual_data(y_true_in, y_pred_in) assert isinstance(results, pd.DataFrame) - assert np.array_equal(results['prediction'], y_pred) - assert np.array_equal(results['actual'], y_true) - assert (results['outlier'] == '#0000ff').all() + assert np.array_equal(results["prediction"], y_pred) + assert np.array_equal(results["actual"], y_true) + assert (results["outlier"] == "#0000ff").all() def test_graph_prediction_vs_actual_default(): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) y_true = [1, 2, 3000, 4, 5, 6, 7, 8, 9, 10, 11, 12] y_pred = [5, 4, 2, 8, 6, 6, 5, 1, 7, 2, 1, 3000] fig = graph_prediction_vs_actual(y_true, y_pred) assert isinstance(fig, type(go.Figure())) fig_dict = fig.to_dict() - assert fig_dict['layout']['title']['text'] == 'Predicted vs Actual Values Scatter Plot' - assert fig_dict['layout']['xaxis']['title']['text'] == 'Prediction' - assert fig_dict['layout']['yaxis']['title']['text'] == 'Actual' - assert len(fig_dict['data']) == 2 - assert fig_dict['data'][0]['name'] == 'y = x line' - assert fig_dict['data'][0]['x'] == fig_dict['data'][0]['y'] - assert len(fig_dict['data'][1]['x']) == len(y_true) - assert fig_dict['data'][1]['marker']['color'] == '#0000ff' - assert fig_dict['data'][1]['name'] == "Values" + assert ( + fig_dict["layout"]["title"]["text"] == 
"Predicted vs Actual Values Scatter Plot" + ) + assert fig_dict["layout"]["xaxis"]["title"]["text"] == "Prediction" + assert fig_dict["layout"]["yaxis"]["title"]["text"] == "Actual" + assert len(fig_dict["data"]) == 2 + assert fig_dict["data"][0]["name"] == "y = x line" + assert fig_dict["data"][0]["x"] == fig_dict["data"][0]["y"] + assert len(fig_dict["data"][1]["x"]) == len(y_true) + assert fig_dict["data"][1]["marker"]["color"] == "#0000ff" + assert fig_dict["data"][1]["name"] == "Values" @pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_graph_prediction_vs_actual(data_type): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) y_true = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] y_pred = [5, 4, 3, 8, 6, 3, 5, 9, 7, 12, 1, 2] @@ -795,11 +956,13 @@ def test_graph_prediction_vs_actual(data_type): fig = graph_prediction_vs_actual(y_true, y_pred, outlier_threshold=100) assert isinstance(fig, type(go.Figure())) fig_dict = fig.to_dict() - assert fig_dict['layout']['title']['text'] == 'Predicted vs Actual Values Scatter Plot' - assert fig_dict['layout']['xaxis']['title']['text'] == 'Prediction' - assert fig_dict['layout']['yaxis']['title']['text'] == 'Actual' - assert len(fig_dict['data']) == 2 - assert fig_dict['data'][1]['marker']['color'] == '#0000ff' + assert ( + fig_dict["layout"]["title"]["text"] == "Predicted vs Actual Values Scatter Plot" + ) + assert fig_dict["layout"]["xaxis"]["title"]["text"] == "Prediction" + assert fig_dict["layout"]["yaxis"]["title"]["text"] == "Actual" + assert len(fig_dict["data"]) == 2 + assert fig_dict["data"][1]["marker"]["color"] == "#0000ff" y_true = pd.Series(y_true) y_pred = pd.Series(y_pred) @@ -809,27 +972,33 @@ def test_graph_prediction_vs_actual(data_type): fig = graph_prediction_vs_actual(y_true, y_pred, outlier_threshold=6.1) assert isinstance(fig, type(go.Figure())) fig_dict = fig.to_dict() - assert fig_dict['layout']['title']['text'] == 'Predicted vs Actual Values Scatter Plot' - assert fig_dict['layout']['xaxis']['title']['text'] == 'Prediction' - assert fig_dict['layout']['yaxis']['title']['text'] == 'Actual' - assert len(fig_dict['data']) == 3 - assert fig_dict['data'][1]['marker']['color'] == '#0000ff' - assert len(fig_dict['data'][1]['x']) == 10 - assert len(fig_dict['data'][1]['y']) == 10 - assert fig_dict['data'][1]['name'] == "< outlier_threshold" - assert fig_dict['data'][2]['marker']['color'] == '#ffff00' - assert len(fig_dict['data'][2]['x']) == 2 - assert len(fig_dict['data'][2]['y']) == 2 - assert fig_dict['data'][2]['name'] == ">= outlier_threshold" - - -@patch('evalml.pipelines.ClassificationPipeline.predict') -@pytest.mark.parametrize("data_type", ['pd', 'ww']) -def test_get_prediction_vs_actual_over_time_data(mock_predict, data_type, logistic_regression_binary_pipeline_class, make_data_type): + assert ( + fig_dict["layout"]["title"]["text"] == "Predicted vs Actual Values Scatter Plot" + ) + assert fig_dict["layout"]["xaxis"]["title"]["text"] == "Prediction" + assert fig_dict["layout"]["yaxis"]["title"]["text"] == "Actual" + assert len(fig_dict["data"]) == 3 + assert fig_dict["data"][1]["marker"]["color"] == "#0000ff" + assert len(fig_dict["data"][1]["x"]) == 10 + assert len(fig_dict["data"][1]["y"]) == 10 + assert fig_dict["data"][1]["name"] == "< outlier_threshold" + assert fig_dict["data"][2]["marker"]["color"] == "#ffff00" + assert 
len(fig_dict["data"][2]["x"]) == 2 + assert len(fig_dict["data"][2]["y"]) == 2 + assert fig_dict["data"][2]["name"] == ">= outlier_threshold" + + +@patch("evalml.pipelines.ClassificationPipeline.predict") +@pytest.mark.parametrize("data_type", ["pd", "ww"]) +def test_get_prediction_vs_actual_over_time_data( + mock_predict, data_type, logistic_regression_binary_pipeline_class, make_data_type +): mock_predict.return_value = pd.Series([0] * 20) X = make_data_type(data_type, pd.DataFrame()) y = make_data_type(data_type, pd.Series([0] * 20)) - dates = make_data_type(data_type, pd.Series(pd.date_range('2000-05-19', periods=20, freq='D'))) + dates = make_data_type( + data_type, pd.Series(pd.date_range("2000-05-19", periods=20, freq="D")) + ) pipeline = logistic_regression_binary_pipeline_class(parameters={}) results = get_prediction_vs_actual_over_time_data(pipeline, X, y, dates) @@ -838,9 +1007,12 @@ def test_get_prediction_vs_actual_over_time_data(mock_predict, data_type, logist def test_graph_prediction_vs_actual_over_time(): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) - class MockPipeline(): + class MockPipeline: problem_type = ProblemTypes.TIME_SERIES_REGRESSION def predict(self, X, y): @@ -854,26 +1026,33 @@ def predict(self, X, y): pipeline = MockPipeline() # For this test it doesn't matter what the features are - fig = graph_prediction_vs_actual_over_time(pipeline, X=pd.DataFrame(), y=y, dates=dates) + fig = graph_prediction_vs_actual_over_time( + pipeline, X=pd.DataFrame(), y=y, dates=dates + ) assert isinstance(fig, go.Figure) fig_dict = fig.to_dict() - assert fig_dict['layout']['title']['text'] == 'Prediction vs Target over time' - assert fig_dict['layout']['xaxis']['title']['text'] == 'Time' - assert fig_dict['layout']['yaxis']['title']['text'] == 'Target Values and Predictions' - assert len(fig_dict['data']) == 2 - assert fig_dict['data'][0]['line']['color'] == '#1f77b4' - assert len(fig_dict['data'][0]['x']) == 61 - assert not np.isnan(fig_dict['data'][0]['y']).all() - assert len(fig_dict['data'][0]['y']) == 61 - assert fig_dict['data'][1]['line']['color'] == '#d62728' - assert len(fig_dict['data'][1]['x']) == 61 - assert len(fig_dict['data'][1]['y']) == 61 - assert not np.isnan(fig_dict['data'][1]['y']).all() + assert fig_dict["layout"]["title"]["text"] == "Prediction vs Target over time" + assert fig_dict["layout"]["xaxis"]["title"]["text"] == "Time" + assert ( + fig_dict["layout"]["yaxis"]["title"]["text"] == "Target Values and Predictions" + ) + assert len(fig_dict["data"]) == 2 + assert fig_dict["data"][0]["line"]["color"] == "#1f77b4" + assert len(fig_dict["data"][0]["x"]) == 61 + assert not np.isnan(fig_dict["data"][0]["y"]).all() + assert len(fig_dict["data"][0]["y"]) == 61 + assert fig_dict["data"][1]["line"]["color"] == "#d62728" + assert len(fig_dict["data"][1]["x"]) == 61 + assert len(fig_dict["data"][1]["y"]) == 61 + assert not np.isnan(fig_dict["data"][1]["y"]).all() def test_graph_prediction_vs_actual_over_time_value_error(): - pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) class NotTSPipeline: problem_type = ProblemTypes.REGRESSION @@ -885,14 +1064,20 @@ class NotTSPipeline: def 
test_decision_tree_data_from_estimator_not_fitted(tree_estimators): est_class, _ = tree_estimators - with pytest.raises(NotFittedError, match="This DecisionTree estimator is not fitted yet. Call 'fit' with " - "appropriate arguments before using this estimator."): + with pytest.raises( + NotFittedError, + match="This DecisionTree estimator is not fitted yet. Call 'fit' with " + "appropriate arguments before using this estimator.", + ): decision_tree_data_from_estimator(est_class) def test_decision_tree_data_from_estimator_wrong_type(logit_estimator): est_logit = logit_estimator - with pytest.raises(ValueError, match="Tree structure reformatting is only supported for decision tree estimators"): + with pytest.raises( + ValueError, + match="Tree structure reformatting is only supported for decision tree estimators", + ): decision_tree_data_from_estimator(est_logit) @@ -903,46 +1088,77 @@ def test_decision_tree_data_from_estimator(fitted_tree_estimators): tree_ = est_reg._component_obj.tree_ assert isinstance(formatted_, OrderedDict) - assert formatted_['Feature'] == f'Testing_{tree_.feature[0]}' - assert formatted_['Threshold'] == tree_.threshold[0] - assert all([a == b for a, b in zip(formatted_['Value'][0], tree_.value[0][0])]) - left_child_feature_ = formatted_['Left_Child']['Feature'] - right_child_feature_ = formatted_['Right_Child']['Feature'] - left_child_threshold_ = formatted_['Left_Child']['Threshold'] - right_child_threshold_ = formatted_['Right_Child']['Threshold'] - left_child_value_ = formatted_['Left_Child']['Value'] - right_child_value_ = formatted_['Right_Child']['Value'] - assert left_child_feature_ == f'Testing_{tree_.feature[tree_.children_left[0]]}' - assert right_child_feature_ == f'Testing_{tree_.feature[tree_.children_right[0]]}' + assert formatted_["Feature"] == f"Testing_{tree_.feature[0]}" + assert formatted_["Threshold"] == tree_.threshold[0] + assert all([a == b for a, b in zip(formatted_["Value"][0], tree_.value[0][0])]) + left_child_feature_ = formatted_["Left_Child"]["Feature"] + right_child_feature_ = formatted_["Right_Child"]["Feature"] + left_child_threshold_ = formatted_["Left_Child"]["Threshold"] + right_child_threshold_ = formatted_["Right_Child"]["Threshold"] + left_child_value_ = formatted_["Left_Child"]["Value"] + right_child_value_ = formatted_["Right_Child"]["Value"] + assert left_child_feature_ == f"Testing_{tree_.feature[tree_.children_left[0]]}" + assert right_child_feature_ == f"Testing_{tree_.feature[tree_.children_right[0]]}" assert left_child_threshold_ == tree_.threshold[tree_.children_left[0]] assert right_child_threshold_ == tree_.threshold[tree_.children_right[0]] # Check that the immediate left and right child of the root node have the correct values - assert all([a == b for a, b in zip(left_child_value_[0], tree_.value[tree_.children_left[0]][0])]) - assert all([a == b for a, b in zip(right_child_value_[0], tree_.value[tree_.children_right[0]][0])]) + assert all( + [ + a == b + for a, b in zip( + left_child_value_[0], tree_.value[tree_.children_left[0]][0] + ) + ] + ) + assert all( + [ + a == b + for a, b in zip( + right_child_value_[0], tree_.value[tree_.children_right[0]][0] + ) + ] + ) def test_decision_tree_data_from_pipeline_not_fitted(): - mock_pipeline = MulticlassClassificationPipeline(component_graph=['Decision Tree Classifier']) - with pytest.raises(NotFittedError, match="The DecisionTree estimator associated with this pipeline is not fitted yet. 
" - "Call 'fit' with appropriate arguments before using this estimator."): + mock_pipeline = MulticlassClassificationPipeline( + component_graph=["Decision Tree Classifier"] + ) + with pytest.raises( + NotFittedError, + match="The DecisionTree estimator associated with this pipeline is not fitted yet. " + "Call 'fit' with appropriate arguments before using this estimator.", + ): decision_tree_data_from_pipeline(mock_pipeline) def test_decision_tree_data_from_pipeline_wrong_type(): - mock_pipeline = MulticlassClassificationPipeline(component_graph=['Logistic Regression Classifier']) - with pytest.raises(ValueError, match="Tree structure reformatting is only supported for decision tree estimators"): + mock_pipeline = MulticlassClassificationPipeline( + component_graph=["Logistic Regression Classifier"] + ) + with pytest.raises( + ValueError, + match="Tree structure reformatting is only supported for decision tree estimators", + ): decision_tree_data_from_pipeline(mock_pipeline) def test_decision_tree_data_from_pipeline_feature_length(X_y_categorical_regression): - mock_pipeline = RegressionPipeline(component_graph=['One Hot Encoder', 'Imputer', 'Decision Tree Regressor']) + mock_pipeline = RegressionPipeline( + component_graph=["One Hot Encoder", "Imputer", "Decision Tree Regressor"] + ) X, y = X_y_categorical_regression mock_pipeline.fit(X, y) - assert len(mock_pipeline.input_feature_names[mock_pipeline.estimator.name]) == mock_pipeline.estimator._component_obj.n_features_ + assert ( + len(mock_pipeline.input_feature_names[mock_pipeline.estimator.name]) + == mock_pipeline.estimator._component_obj.n_features_ + ) def test_decision_tree_data_from_pipeline(X_y_categorical_regression): - mock_pipeline = RegressionPipeline(component_graph=['One Hot Encoder', 'Imputer', 'Decision Tree Regressor']) + mock_pipeline = RegressionPipeline( + component_graph=["One Hot Encoder", "Imputer", "Decision Tree Regressor"] + ) X, y = X_y_categorical_regression mock_pipeline.fit(X, y) formatted_ = decision_tree_data_from_pipeline(mock_pipeline) @@ -950,94 +1166,131 @@ def test_decision_tree_data_from_pipeline(X_y_categorical_regression): feature_names = mock_pipeline.input_feature_names[mock_pipeline.estimator.name] assert isinstance(formatted_, OrderedDict) - assert formatted_['Feature'] == feature_names[tree_.feature[0]] - assert formatted_['Threshold'] == tree_.threshold[0] - assert all([a == b for a, b in zip(formatted_['Value'][0], tree_.value[0][0])]) - left_child_feature_ = formatted_['Left_Child']['Feature'] - right_child_feature_ = formatted_['Right_Child']['Feature'] - left_child_threshold_ = formatted_['Left_Child']['Threshold'] - right_child_threshold_ = formatted_['Right_Child']['Threshold'] - left_child_value_ = formatted_['Left_Child']['Value'] - right_child_value_ = formatted_['Right_Child']['Value'] + assert formatted_["Feature"] == feature_names[tree_.feature[0]] + assert formatted_["Threshold"] == tree_.threshold[0] + assert all([a == b for a, b in zip(formatted_["Value"][0], tree_.value[0][0])]) + left_child_feature_ = formatted_["Left_Child"]["Feature"] + right_child_feature_ = formatted_["Right_Child"]["Feature"] + left_child_threshold_ = formatted_["Left_Child"]["Threshold"] + right_child_threshold_ = formatted_["Right_Child"]["Threshold"] + left_child_value_ = formatted_["Left_Child"]["Value"] + right_child_value_ = formatted_["Right_Child"]["Value"] assert left_child_feature_ == feature_names[tree_.feature[tree_.children_left[0]]] assert right_child_feature_ == 
feature_names[tree_.feature[tree_.children_right[0]]] assert left_child_threshold_ == tree_.threshold[tree_.children_left[0]] assert right_child_threshold_ == tree_.threshold[tree_.children_right[0]] # Check that the immediate left and right child of the root node have the correct values - assert all([a == b for a, b in zip(left_child_value_[0], tree_.value[tree_.children_left[0]][0])]) - assert all([a == b for a, b in zip(right_child_value_[0], tree_.value[tree_.children_right[0]][0])]) + assert all( + [ + a == b + for a, b in zip( + left_child_value_[0], tree_.value[tree_.children_left[0]][0] + ) + ] + ) + assert all( + [ + a == b + for a, b in zip( + right_child_value_[0], tree_.value[tree_.children_right[0]][0] + ) + ] + ) def test_visualize_decision_trees_filepath(fitted_tree_estimators, tmpdir): - graphviz = pytest.importorskip('graphviz', reason='Skipping visualizing test because graphviz not installed') + graphviz = pytest.importorskip( + "graphviz", reason="Skipping visualizing test because graphviz not installed" + ) est_class, _ = fitted_tree_estimators - filepath = os.path.join(str(tmpdir), 'invalid', 'path', 'test.png') + filepath = os.path.join(str(tmpdir), "invalid", "path", "test.png") assert not os.path.exists(filepath) with pytest.raises(ValueError, match="Specified filepath is not writeable"): visualize_decision_tree(estimator=est_class, filepath=filepath) - filepath = os.path.join(str(tmpdir), 'test_0.png') + filepath = os.path.join(str(tmpdir), "test_0.png") src = visualize_decision_tree(estimator=est_class, filepath=filepath) assert os.path.exists(filepath) - assert src.format == 'png' + assert src.format == "png" assert isinstance(src, graphviz.Source) def test_visualize_decision_trees_wrong_format(fitted_tree_estimators, tmpdir): - graphviz = pytest.importorskip('graphviz', reason='Skipping visualizing test because graphviz not installed') + graphviz = pytest.importorskip( + "graphviz", reason="Skipping visualizing test because graphviz not installed" + ) est_class, _ = fitted_tree_estimators - filepath = os.path.join(str(tmpdir), 'test_0.xyz') - with pytest.raises(ValueError, match=f"Unknown format 'xyz'. Make sure your format is one of the following: " - f"{graphviz.backend.FORMATS}"): + filepath = os.path.join(str(tmpdir), "test_0.xyz") + with pytest.raises( + ValueError, + match=f"Unknown format 'xyz'. Make sure your format is one of the following: " + f"{graphviz.backend.FORMATS}", + ): visualize_decision_tree(estimator=est_class, filepath=filepath) def test_visualize_decision_trees_est_wrong_type(logit_estimator, tmpdir): est_logit = logit_estimator - filepath = os.path.join(str(tmpdir), 'test_1.png') - with pytest.raises(ValueError, match="Tree visualizations are only supported for decision tree estimators"): + filepath = os.path.join(str(tmpdir), "test_1.png") + with pytest.raises( + ValueError, + match="Tree visualizations are only supported for decision tree estimators", + ): visualize_decision_tree(estimator=est_logit, filepath=filepath) def test_visualize_decision_trees_max_depth(tree_estimators, tmpdir): est_class, _ = tree_estimators - filepath = os.path.join(str(tmpdir), 'test_1.png') - with pytest.raises(ValueError, match="Unknown value: '-1'. The parameter max_depth has to be a non-negative integer"): + filepath = os.path.join(str(tmpdir), "test_1.png") + with pytest.raises( + ValueError, + match="Unknown value: '-1'. 
The parameter max_depth has to be a non-negative integer", + ): visualize_decision_tree(estimator=est_class, max_depth=-1, filepath=filepath) def test_visualize_decision_trees_not_fitted(tree_estimators, tmpdir): est_class, _ = tree_estimators - filepath = os.path.join(str(tmpdir), 'test_1.png') - with pytest.raises(NotFittedError, match="This DecisionTree estimator is not fitted yet. Call 'fit' with " - "appropriate arguments before using this estimator."): + filepath = os.path.join(str(tmpdir), "test_1.png") + with pytest.raises( + NotFittedError, + match="This DecisionTree estimator is not fitted yet. Call 'fit' with " + "appropriate arguments before using this estimator.", + ): visualize_decision_tree(estimator=est_class, max_depth=3, filepath=filepath) def test_visualize_decision_trees(fitted_tree_estimators, tmpdir): - graphviz = pytest.importorskip('graphviz', reason='Skipping visualizing test because graphviz not installed') + graphviz = pytest.importorskip( + "graphviz", reason="Skipping visualizing test because graphviz not installed" + ) est_class, est_reg = fitted_tree_estimators - filepath = os.path.join(str(tmpdir), 'test_2') - src = visualize_decision_tree(estimator=est_class, filled=True, max_depth=3, rotate=True, filepath=filepath) - assert src.format == 'pdf' # Check that extension defaults to pdf + filepath = os.path.join(str(tmpdir), "test_2") + src = visualize_decision_tree( + estimator=est_class, filled=True, max_depth=3, rotate=True, filepath=filepath + ) + assert src.format == "pdf" # Check that extension defaults to pdf assert isinstance(src, graphviz.Source) - filepath = os.path.join(str(tmpdir), 'test_3.pdf') + filepath = os.path.join(str(tmpdir), "test_3.pdf") src = visualize_decision_tree(estimator=est_reg, filled=True, filepath=filepath) - assert src.format == 'pdf' + assert src.format == "pdf" assert isinstance(src, graphviz.Source) src = visualize_decision_tree(estimator=est_reg, filled=True, max_depth=2) - assert src.format == 'pdf' + assert src.format == "pdf" assert isinstance(src, graphviz.Source) def test_linear_coefficients_errors(): dt = DecisionTreeRegressor() - with pytest.raises(ValueError, match="Linear coefficients are only available for linear family models"): + with pytest.raises( + ValueError, + match="Linear coefficients are only available for linear family models", + ): get_linear_coefficients(dt) lin = LinearRegressor() @@ -1048,26 +1301,36 @@ def test_linear_coefficients_errors(): @pytest.mark.parametrize("estimator", [LinearRegressor, ElasticNetRegressor]) def test_linear_coefficients_output(estimator): - X = pd.DataFrame([[1, 2, 3, 5], - [3, 5, 2, 1], - [5, 2, 2, 2], - [3, 2, 3, 3]], columns=['First', 'Second', 'Third', 'Fourth']) + X = pd.DataFrame( + [[1, 2, 3, 5], [3, 5, 2, 1], [5, 2, 2, 2], [3, 2, 3, 3]], + columns=["First", "Second", "Third", "Fourth"], + ) y = pd.Series([2, 1, 3, 4]) est_ = estimator() est_.fit(X, y) - output_ = get_linear_coefficients(est_, features=['First', 'Second', 'Third', 'Fourth']) - assert (output_.index == ['Intercept', 'Second', 'Fourth', 'First', 'Third']).all() + output_ = get_linear_coefficients( + est_, features=["First", "Second", "Third", "Fourth"] + ) + assert (output_.index == ["Intercept", "Second", "Fourth", "First", "Third"]).all() assert output_.shape[0] == X.shape[1] + 1 - assert (pd.Series(est_._component_obj.intercept_, index=['Intercept']).append(pd.Series(est_.feature_importance).sort_values()) == output_.values).all() + assert ( + pd.Series(est_._component_obj.intercept_, 
index=["Intercept"]).append( + pd.Series(est_.feature_importance).sort_values() + ) + == output_.values + ).all() @pytest.mark.parametrize("n_components", [2.0, -2, 0]) def test_t_sne_errors_n_components(n_components): X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) - with pytest.raises(ValueError, match=f"The parameter n_components must be of type integer and greater than 0"): + with pytest.raises( + ValueError, + match=f"The parameter n_components must be of type integer and greater than 0", + ): t_sne(X, n_components=n_components) @@ -1075,17 +1338,19 @@ def test_t_sne_errors_n_components(n_components): def test_t_sne_errors_perplexity(perplexity): X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) - with pytest.raises(ValueError, match=f"The parameter perplexity must be non-negative"): + with pytest.raises( + ValueError, match=f"The parameter perplexity must be non-negative" + ): t_sne(X, perplexity=perplexity) -@pytest.mark.parametrize("data_type", ['np', 'pd', 'ww']) +@pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) def test_t_sne(data_type): - if data_type == 'np': + if data_type == "np": X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) - elif data_type == 'pd': + elif data_type == "pd": X = pd.DataFrame([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) - elif data_type == 'ww': + elif data_type == "ww": X = pd.DataFrame(np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])) X.ww.init() @@ -1099,7 +1364,9 @@ def test_t_sne_errors_marker_line_width(marker_line_width, has_minimal_dependenc pytest.skip("Skipping plotting test because plotly not installed") X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) - with pytest.raises(ValueError, match=f"The parameter marker_line_width must be non-negative"): + with pytest.raises( + ValueError, match=f"The parameter marker_line_width must be non-negative" + ): graph_t_sne(X, marker_line_width=marker_line_width) @@ -1109,28 +1376,40 @@ def test_t_sne_errors_marker_size(marker_size, has_minimal_dependencies): pytest.skip("Skipping plotting test because plotly not installed") X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) - with pytest.raises(ValueError, match=f"The parameter marker_size must be non-negative"): + with pytest.raises( + ValueError, match=f"The parameter marker_size must be non-negative" + ): graph_t_sne(X, marker_size=marker_size) -@pytest.mark.parametrize("data_type", ['np', 'pd', 'ww']) +@pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) @pytest.mark.parametrize("perplexity", [0, 4.6, 100]) @pytest.mark.parametrize("learning_rate", [100.0, -15, 0]) def test_graph_t_sne(data_type, perplexity, learning_rate): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') - if data_type == 'np': + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) + if data_type == "np": X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) - elif data_type == 'pd': + elif data_type == "pd": X = pd.DataFrame([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) - elif data_type == 'ww': + elif data_type == "ww": X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) X = infer_feature_types(X) for width_, size_ in [(3, 2), (2, 3), (1, 4)]: - fig = graph_t_sne(X, n_components=2, perplexity=perplexity, learning_rate=learning_rate, marker_line_width=width_, marker_size=size_) + fig = graph_t_sne( + X, + n_components=2, + perplexity=perplexity, + learning_rate=learning_rate, 
+ marker_line_width=width_, + marker_size=size_, + ) assert isinstance(fig, go.Figure) - fig_dict_data = fig.to_dict()['data'][0] - assert fig_dict_data['marker']['line']['width'] == width_ - assert fig_dict_data['marker']['size'] == size_ - assert fig_dict_data['mode'] == 'markers' - assert fig_dict_data['type'] == 'scatter' + fig_dict_data = fig.to_dict()["data"][0] + assert fig_dict_data["marker"]["line"]["width"] == width_ + assert fig_dict_data["marker"]["size"] == size_ + assert fig_dict_data["mode"] == "markers" + assert fig_dict_data["type"] == "scatter" diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py index 2ba4b529dc..c37e363759 100644 --- a/evalml/tests/model_understanding_tests/test_partial_dependence.py +++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py @@ -5,15 +5,12 @@ from evalml.demos import load_breast_cancer, load_fraud, load_wine from evalml.exceptions import NullsInColumnWarning -from evalml.model_understanding import ( - graph_partial_dependence, - partial_dependence -) +from evalml.model_understanding import graph_partial_dependence, partial_dependence from evalml.pipelines import ( BinaryClassificationPipeline, ClassificationPipeline, MulticlassClassificationPipeline, - RegressionPipeline + RegressionPipeline, ) from evalml.problem_types import ProblemTypes @@ -21,7 +18,12 @@ @pytest.fixture def test_pipeline(): class TestPipeline(BinaryClassificationPipeline): - component_graph = ['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier'] + component_graph = [ + "Simple Imputer", + "One Hot Encoder", + "Standard Scaler", + "Logistic Regression Classifier", + ] def __init__(self, parameters, random_seed=0): super().__init__(self.component_graph, parameters=parameters) @@ -41,32 +43,52 @@ def check_partial_dependence_dataframe(pipeline, part_dep, grid_size=20): columns = ["feature_values", "partial_dependence"] if isinstance(pipeline, ClassificationPipeline): columns.append("class_label") - n_rows_for_class = len(pipeline.classes_) if isinstance(pipeline, MulticlassClassificationPipeline) else 1 + n_rows_for_class = ( + len(pipeline.classes_) + if isinstance(pipeline, MulticlassClassificationPipeline) + else 1 + ) assert list(part_dep.columns) == columns assert len(part_dep["partial_dependence"]) == grid_size * n_rows_for_class assert len(part_dep["feature_values"]) == grid_size * n_rows_for_class if isinstance(pipeline, ClassificationPipeline): - per_class_counts = part_dep['class_label'].value_counts() + per_class_counts = part_dep["class_label"].value_counts() assert all(value == grid_size for value in per_class_counts.values) @pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) -@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION]) -def test_partial_dependence_problem_types(data_type, problem_type, X_y_binary, X_y_multi, X_y_regression, - logistic_regression_binary_pipeline_class, - logistic_regression_multiclass_pipeline_class, - linear_regression_pipeline_class, make_data_type): +@pytest.mark.parametrize( + "problem_type", + [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION], +) +def test_partial_dependence_problem_types( + data_type, + problem_type, + X_y_binary, + X_y_multi, + X_y_regression, + logistic_regression_binary_pipeline_class, + logistic_regression_multiclass_pipeline_class, + linear_regression_pipeline_class, + 
make_data_type, +): if problem_type == ProblemTypes.BINARY: X, y = X_y_binary - pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) elif problem_type == ProblemTypes.MULTICLASS: X, y = X_y_multi - pipeline = logistic_regression_multiclass_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = logistic_regression_multiclass_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) elif problem_type == ProblemTypes.REGRESSION: X, y = X_y_regression - pipeline = linear_regression_pipeline_class(parameters={"Linear Regressor": {"n_jobs": 1}}) + pipeline = linear_regression_pipeline_class( + parameters={"Linear Regressor": {"n_jobs": 1}} + ) X = make_data_type(data_type, X) pipeline.fit(X, y) @@ -75,35 +97,55 @@ def test_partial_dependence_problem_types(data_type, problem_type, X_y_binary, X assert not part_dep.isnull().any(axis=None) -def test_partial_dependence_string_feature_name(logistic_regression_binary_pipeline_class): +def test_partial_dependence_string_feature_name( + logistic_regression_binary_pipeline_class, +): X, y = load_breast_cancer() - pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) pipeline.fit(X, y) - part_dep = partial_dependence(pipeline, X, features="mean radius", grid_resolution=20) - assert list(part_dep.columns) == ["feature_values", "partial_dependence", "class_label"] + part_dep = partial_dependence( + pipeline, X, features="mean radius", grid_resolution=20 + ) + assert list(part_dep.columns) == [ + "feature_values", + "partial_dependence", + "class_label", + ] assert len(part_dep["partial_dependence"]) == 20 assert len(part_dep["feature_values"]) == 20 assert not part_dep.isnull().any(axis=None) @pytest.mark.parametrize("data_type", ["pd", "ww"]) -def test_partial_dependence_with_non_numeric_columns(data_type, linear_regression_pipeline_class, logistic_regression_binary_pipeline_class): - X = pd.DataFrame({'numeric': [1, 2, 3, 0], - 'also numeric': [2, 3, 4, 1], - 'string': ['a', 'b', 'a', 'c'], - 'also string': ['c', 'b', 'a', 'd']}) +def test_partial_dependence_with_non_numeric_columns( + data_type, + linear_regression_pipeline_class, + logistic_regression_binary_pipeline_class, +): + X = pd.DataFrame( + { + "numeric": [1, 2, 3, 0], + "also numeric": [2, 3, 4, 1], + "string": ["a", "b", "a", "c"], + "also string": ["c", "b", "a", "d"], + } + ) if data_type == "ww": X.ww.init() y = [0, 0.2, 1.4, 1] - pipeline = linear_regression_pipeline_class(parameters={"Linear Regressor": {"n_jobs": 1}}) + pipeline = linear_regression_pipeline_class( + parameters={"Linear Regressor": {"n_jobs": 1}} + ) pipeline.fit(X, y) - part_dep = partial_dependence(pipeline, X, features='numeric') + part_dep = partial_dependence(pipeline, X, features="numeric") assert list(part_dep.columns) == ["feature_values", "partial_dependence"] assert len(part_dep["partial_dependence"]) == 4 assert len(part_dep["feature_values"]) == 4 assert not part_dep.isnull().any(axis=None) - part_dep = partial_dependence(pipeline, X, features='string') + part_dep = partial_dependence(pipeline, X, features="string") assert list(part_dep.columns) == ["feature_values", "partial_dependence"] 
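
For reference alongside the reformatted tests above: a minimal sketch of the partial_dependence call these hunks exercise, using only the imports, pipeline constructor, and call signature that already appear in the test code (the component graph mirrors the test_pipeline fixture; nothing here is a new API).

from evalml.demos import load_breast_cancer
from evalml.model_understanding import partial_dependence
from evalml.pipelines import BinaryClassificationPipeline

X, y = load_breast_cancer()
pipeline = BinaryClassificationPipeline(
    component_graph=[
        "Simple Imputer",
        "One Hot Encoder",
        "Standard Scaler",
        "Logistic Regression Classifier",
    ],
    parameters={"Logistic Regression Classifier": {"n_jobs": 1}},
)
pipeline.fit(X, y)

# One-way partial dependence over a single feature. For classification
# pipelines the result is a DataFrame with "feature_values",
# "partial_dependence", and "class_label" columns, one block of rows per class.
part_dep = partial_dependence(pipeline, X, features="mean radius", grid_resolution=20)

Two-way dependence is requested by passing a tuple of two feature names, as the tests further down do.
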
assert len(part_dep["partial_dependence"]) == 3 assert len(part_dep["feature_values"]) == 3 @@ -113,62 +155,90 @@ def test_partial_dependence_with_non_numeric_columns(data_type, linear_regressio def test_partial_dependence_baseline(): X = pd.DataFrame([[1, 0], [0, 1]]) y = pd.Series([0, 1]) - pipeline = BinaryClassificationPipeline(component_graph=["Baseline Classifier"], parameters={}) + pipeline = BinaryClassificationPipeline( + component_graph=["Baseline Classifier"], parameters={} + ) pipeline.fit(X, y) - with pytest.raises(ValueError, match="Partial dependence plots are not supported for Baseline pipelines"): + with pytest.raises( + ValueError, + match="Partial dependence plots are not supported for Baseline pipelines", + ): partial_dependence(pipeline, X, features=0, grid_resolution=20) @pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]) -def test_partial_dependence_catboost(problem_type, X_y_binary, X_y_multi, has_minimal_dependencies): +def test_partial_dependence_catboost( + problem_type, X_y_binary, X_y_multi, has_minimal_dependencies +): if not has_minimal_dependencies: if problem_type == ProblemTypes.BINARY: X, y = X_y_binary - y_small = ['a', 'b', 'a'] + y_small = ["a", "b", "a"] pipeline_class = BinaryClassificationPipeline else: X, y = X_y_multi - y_small = ['a', 'b', 'c'] + y_small = ["a", "b", "c"] pipeline_class = MulticlassClassificationPipeline - pipeline = pipeline_class(component_graph=["CatBoost Classifier"], - parameters={"CatBoost Classifier": {'thread_count': 1}}) + pipeline = pipeline_class( + component_graph=["CatBoost Classifier"], + parameters={"CatBoost Classifier": {"thread_count": 1}}, + ) pipeline.fit(X, y) part_dep = partial_dependence(pipeline, X, features=0, grid_resolution=20) check_partial_dependence_dataframe(pipeline, part_dep) assert not part_dep.isnull().all().all() # test that CatBoost can natively handle non-numerical columns as feature passed to partial_dependence - X = pd.DataFrame({'numeric': [1, 2, 3], 'also numeric': [2, 3, 4], 'string': ['a', 'b', 'c'], 'also string': ['c', 'b', 'a']}) - pipeline = pipeline_class(component_graph=["CatBoost Classifier"], - parameters={"CatBoost Classifier": {'thread_count': 1}}) + X = pd.DataFrame( + { + "numeric": [1, 2, 3], + "also numeric": [2, 3, 4], + "string": ["a", "b", "c"], + "also string": ["c", "b", "a"], + } + ) + pipeline = pipeline_class( + component_graph=["CatBoost Classifier"], + parameters={"CatBoost Classifier": {"thread_count": 1}}, + ) pipeline.fit(X, y_small) - part_dep = partial_dependence(pipeline, X, features='string') + part_dep = partial_dependence(pipeline, X, features="string") check_partial_dependence_dataframe(pipeline, part_dep, grid_size=3) assert not part_dep.isnull().all().all() -@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION]) -def test_partial_dependence_xgboost_feature_names(problem_type, has_minimal_dependencies, - X_y_binary, X_y_multi, X_y_regression): +@pytest.mark.parametrize( + "problem_type", + [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION], +) +def test_partial_dependence_xgboost_feature_names( + problem_type, has_minimal_dependencies, X_y_binary, X_y_multi, X_y_regression +): if has_minimal_dependencies: pytest.skip("Skipping because XGBoost not installed for minimal dependencies") if problem_type == ProblemTypes.REGRESSION: - pipeline = RegressionPipeline(component_graph=['Simple Imputer', 'XGBoost Regressor'], - parameters={'XGBoost 
Classifier': {'nthread': 1}}) + pipeline = RegressionPipeline( + component_graph=["Simple Imputer", "XGBoost Regressor"], + parameters={"XGBoost Classifier": {"nthread": 1}}, + ) X, y = X_y_regression elif problem_type == ProblemTypes.BINARY: - pipeline = BinaryClassificationPipeline(component_graph=['Simple Imputer', 'XGBoost Classifier'], - parameters={'XGBoost Classifier': {'nthread': 1}}) + pipeline = BinaryClassificationPipeline( + component_graph=["Simple Imputer", "XGBoost Classifier"], + parameters={"XGBoost Classifier": {"nthread": 1}}, + ) X, y = X_y_binary elif problem_type == ProblemTypes.MULTICLASS: - pipeline = MulticlassClassificationPipeline(component_graph=['Simple Imputer', 'XGBoost Classifier'], - parameters={'XGBoost Classifier': {'nthread': 1}}) + pipeline = MulticlassClassificationPipeline( + component_graph=["Simple Imputer", "XGBoost Classifier"], + parameters={"XGBoost Classifier": {"nthread": 1}}, + ) X, y = X_y_multi X = pd.DataFrame(X) - X = X.rename(columns={0: '<[0]'}) + X = X.rename(columns={0: "<[0]"}) pipeline.fit(X, y) part_dep = partial_dependence(pipeline, X, features="<[0]", grid_resolution=20) check_partial_dependence_dataframe(pipeline, part_dep) @@ -181,25 +251,32 @@ def test_partial_dependence_xgboost_feature_names(problem_type, has_minimal_depe def test_partial_dependence_multiclass(logistic_regression_multiclass_pipeline_class): X, y = load_wine() - pipeline = logistic_regression_multiclass_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = logistic_regression_multiclass_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) pipeline.fit(X, y) num_classes = y.nunique() grid_resolution = 20 - one_way_part_dep = partial_dependence(pipeline=pipeline, - X=X, - features="magnesium", - grid_resolution=grid_resolution) + one_way_part_dep = partial_dependence( + pipeline=pipeline, X=X, features="magnesium", grid_resolution=grid_resolution + ) assert "class_label" in one_way_part_dep.columns assert one_way_part_dep["class_label"].nunique() == num_classes assert len(one_way_part_dep.index) == num_classes * grid_resolution - assert list(one_way_part_dep.columns) == ["feature_values", "partial_dependence", "class_label"] - - two_way_part_dep = partial_dependence(pipeline=pipeline, - X=X, - features=("magnesium", "alcohol"), - grid_resolution=grid_resolution) + assert list(one_way_part_dep.columns) == [ + "feature_values", + "partial_dependence", + "class_label", + ] + + two_way_part_dep = partial_dependence( + pipeline=pipeline, + X=X, + features=("magnesium", "alcohol"), + grid_resolution=grid_resolution, + ) assert "class_label" in two_way_part_dep.columns assert two_way_part_dep["class_label"].nunique() == num_classes @@ -207,71 +284,114 @@ def test_partial_dependence_multiclass(logistic_regression_multiclass_pipeline_c assert len(two_way_part_dep.columns) == grid_resolution + 1 -def test_partial_dependence_not_fitted(X_y_binary, logistic_regression_binary_pipeline_class): +def test_partial_dependence_not_fitted( + X_y_binary, logistic_regression_binary_pipeline_class +): X, y = X_y_binary - pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) - with pytest.raises(ValueError, match="Pipeline to calculate partial dependence for must be fitted"): + pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) + with pytest.raises( + ValueError, match="Pipeline to calculate 
partial dependence for must be fitted" + ): partial_dependence(pipeline, X, features=0, grid_resolution=20) def test_partial_dependence_warning(logistic_regression_binary_pipeline_class): - X = pd.DataFrame({'a': [1, 2, None, 2, 2], 'b': [1, 1, 2, 2, 1]}) + X = pd.DataFrame({"a": [1, 2, None, 2, 2], "b": [1, 1, 2, 2, 1]}) y = pd.Series([0, 1, 0, 1, 0]) - pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) pipeline.fit(X, y) - with pytest.warns(NullsInColumnWarning, match="There are null values in the features, which will cause NaN values in the partial dependence output"): + with pytest.warns( + NullsInColumnWarning, + match="There are null values in the features, which will cause NaN values in the partial dependence output", + ): partial_dependence(pipeline, X, features=0, grid_resolution=20) - with pytest.warns(NullsInColumnWarning, match="There are null values in the features, which will cause NaN values in the partial dependence output"): - partial_dependence(pipeline, X, features=('a', "b"), grid_resolution=20) - with pytest.warns(NullsInColumnWarning, match="There are null values in the features, which will cause NaN values in the partial dependence output"): - partial_dependence(pipeline, X, features='a', grid_resolution=20) + with pytest.warns( + NullsInColumnWarning, + match="There are null values in the features, which will cause NaN values in the partial dependence output", + ): + partial_dependence(pipeline, X, features=("a", "b"), grid_resolution=20) + with pytest.warns( + NullsInColumnWarning, + match="There are null values in the features, which will cause NaN values in the partial dependence output", + ): + partial_dependence(pipeline, X, features="a", grid_resolution=20) def test_partial_dependence_errors(logistic_regression_binary_pipeline_class): - X = pd.DataFrame({'a': [2, None, 2, 2], 'b': [1, 2, 2, 1], 'c': [0, 0, 0, 0]}) + X = pd.DataFrame({"a": [2, None, 2, 2], "b": [1, 2, 2, 1], "c": [0, 0, 0, 0]}) y = pd.Series([0, 1, 0, 1]) - pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) pipeline.fit(X, y) - with pytest.raises(ValueError, match="Too many features given to graph_partial_dependence. Only one or two-way partial dependence is supported."): - partial_dependence(pipeline, X, features=('a', 'b', 'c'), grid_resolution=20) + with pytest.raises( + ValueError, + match="Too many features given to graph_partial_dependence. 
Only one or two-way partial dependence is supported.", + ): + partial_dependence(pipeline, X, features=("a", "b", "c"), grid_resolution=20) - with pytest.raises(ValueError, match="Features provided must be a tuple entirely of integers or strings, not a mixture of both."): - partial_dependence(pipeline, X, features=(0, 'b')) + with pytest.raises( + ValueError, + match="Features provided must be a tuple entirely of integers or strings, not a mixture of both.", + ): + partial_dependence(pipeline, X, features=(0, "b")) -def test_partial_dependence_more_categories_than_grid_resolution(logistic_regression_binary_pipeline_class): +def test_partial_dependence_more_categories_than_grid_resolution( + logistic_regression_binary_pipeline_class, +): def round_dict_keys(dictionary, places=6): - """ Function to round all keys of a dictionary that has floats as keys. """ + """Function to round all keys of a dictionary that has floats as keys.""" dictionary_rounded = {} for key in dictionary: dictionary_rounded[round(key, places)] = dictionary[key] return dictionary_rounded X, y = load_fraud(1000) - X = X.drop(columns=['datetime', 'expiration_date', 'country', 'region', 'provider']) + X = X.drop(columns=["datetime", "expiration_date", "country", "region", "provider"]) pipeline = logistic_regression_binary_pipeline_class({}) pipeline.fit(X, y) num_cat_features = len(set(X["currency"])) assert num_cat_features == 164 - part_dep_ans = {0.1432616813857269: 154, 0.1502346349971562: 1, 0.14487916687594762: 1, - 0.1573183451314127: 1, 0.11695462432136654: 1, 0.07950579532536253: 1, 0.006794444792966759: 1, - 0.17745270478939879: 1, 0.1666874487986626: 1, 0.13357573073236878: 1, 0.06778096366056789: 1} + part_dep_ans = { + 0.1432616813857269: 154, + 0.1502346349971562: 1, + 0.14487916687594762: 1, + 0.1573183451314127: 1, + 0.11695462432136654: 1, + 0.07950579532536253: 1, + 0.006794444792966759: 1, + 0.17745270478939879: 1, + 0.1666874487986626: 1, + 0.13357573073236878: 1, + 0.06778096366056789: 1, + } part_dep_ans_rounded = round_dict_keys(part_dep_ans) # Check the case where grid_resolution < number of categorical features - part_dep = partial_dependence(pipeline, X, 'currency', grid_resolution=round(num_cat_features / 2)) + part_dep = partial_dependence( + pipeline, X, "currency", grid_resolution=round(num_cat_features / 2) + ) part_dep_dict = dict(part_dep["partial_dependence"].value_counts()) assert part_dep_ans_rounded == round_dict_keys(part_dep_dict) # Check the case where grid_resolution == number of categorical features - part_dep = partial_dependence(pipeline, X, 'currency', grid_resolution=round(num_cat_features)) + part_dep = partial_dependence( + pipeline, X, "currency", grid_resolution=round(num_cat_features) + ) part_dep_dict = dict(part_dep["partial_dependence"].value_counts()) assert part_dep_ans_rounded == round_dict_keys(part_dep_dict) # Check the case where grid_resolution > number of categorical features - part_dep = partial_dependence(pipeline, X, 'currency', grid_resolution=round(num_cat_features * 2)) + part_dep = partial_dependence( + pipeline, X, "currency", grid_resolution=round(num_cat_features * 2) + ) part_dep_dict = dict(part_dep["partial_dependence"].value_counts()) assert part_dep_ans_rounded == round_dict_keys(part_dep_dict) @@ -279,222 +399,389 @@ def round_dict_keys(dictionary, places=6): def test_graph_partial_dependence(test_pipeline): X, y = load_breast_cancer() - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') 
+ go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) clf = test_pipeline clf.fit(X, y) - fig = graph_partial_dependence(clf, X, features='mean radius', grid_resolution=20) + fig = graph_partial_dependence(clf, X, features="mean radius", grid_resolution=20) assert isinstance(fig, go.Figure) fig_dict = fig.to_dict() - assert fig_dict['layout']['title']['text'] == "Partial Dependence of 'mean radius'" - assert len(fig_dict['data']) == 1 - assert fig_dict['data'][0]['name'] == "Partial Dependence" + assert fig_dict["layout"]["title"]["text"] == "Partial Dependence of 'mean radius'" + assert len(fig_dict["data"]) == 1 + assert fig_dict["data"][0]["name"] == "Partial Dependence" - part_dep_data = partial_dependence(clf, X, features='mean radius', grid_resolution=20) - assert np.array_equal(fig_dict['data'][0]['x'], part_dep_data['feature_values']) - assert np.array_equal(fig_dict['data'][0]['y'], part_dep_data['partial_dependence'].values) + part_dep_data = partial_dependence( + clf, X, features="mean radius", grid_resolution=20 + ) + assert np.array_equal(fig_dict["data"][0]["x"], part_dep_data["feature_values"]) + assert np.array_equal( + fig_dict["data"][0]["y"], part_dep_data["partial_dependence"].values + ) def test_graph_two_way_partial_dependence(test_pipeline): X, y = load_breast_cancer() - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) clf = test_pipeline clf.fit(X, y) - fig = graph_partial_dependence(clf, X, features=('mean radius', 'mean area'), grid_resolution=5) + fig = graph_partial_dependence( + clf, X, features=("mean radius", "mean area"), grid_resolution=5 + ) assert isinstance(fig, go.Figure) fig_dict = fig.to_dict() - assert fig_dict['layout']['title']['text'] == "Partial Dependence of 'mean radius' vs. 'mean area'" - assert len(fig_dict['data']) == 1 - assert fig_dict['data'][0]['name'] == "Partial Dependence" - - part_dep_data = partial_dependence(clf, X, features=('mean radius', 'mean area'), grid_resolution=5) - part_dep_data.drop(columns=['class_label'], inplace=True) - assert np.array_equal(fig_dict['data'][0]['x'], part_dep_data.columns) - assert np.array_equal(fig_dict['data'][0]['y'], part_dep_data.index) - assert np.array_equal(fig_dict['data'][0]['z'], part_dep_data.values) - - -def test_graph_partial_dependence_multiclass(logistic_regression_multiclass_pipeline_class): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + assert ( + fig_dict["layout"]["title"]["text"] + == "Partial Dependence of 'mean radius' vs. 
'mean area'" + ) + assert len(fig_dict["data"]) == 1 + assert fig_dict["data"][0]["name"] == "Partial Dependence" + + part_dep_data = partial_dependence( + clf, X, features=("mean radius", "mean area"), grid_resolution=5 + ) + part_dep_data.drop(columns=["class_label"], inplace=True) + assert np.array_equal(fig_dict["data"][0]["x"], part_dep_data.columns) + assert np.array_equal(fig_dict["data"][0]["y"], part_dep_data.index) + assert np.array_equal(fig_dict["data"][0]["z"], part_dep_data.values) + + +def test_graph_partial_dependence_multiclass( + logistic_regression_multiclass_pipeline_class, +): + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y = load_wine() - pipeline = logistic_regression_multiclass_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = logistic_regression_multiclass_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) pipeline.fit(X, y) # Test one-way without class labels - fig_one_way_no_class_labels = graph_partial_dependence(pipeline, X, features='magnesium', grid_resolution=20) + fig_one_way_no_class_labels = graph_partial_dependence( + pipeline, X, features="magnesium", grid_resolution=20 + ) assert isinstance(fig_one_way_no_class_labels, go.Figure) fig_dict = fig_one_way_no_class_labels.to_dict() - assert len(fig_dict['data']) == len(pipeline.classes_) - for data, label in zip(fig_dict['data'], pipeline.classes_): - assert len(data['x']) == 20 - assert len(data['y']) == 20 - assert data['name'] == label + assert len(fig_dict["data"]) == len(pipeline.classes_) + for data, label in zip(fig_dict["data"], pipeline.classes_): + assert len(data["x"]) == 20 + assert len(data["y"]) == 20 + assert data["name"] == label # Check that all the subplots axes have the same range - for suplot_1_axis, suplot_2_axis in [('axis2', 'axis3'), ('axis2', 'axis4'), ('axis3', 'axis4')]: - for axis_type in ['x', 'y']: - assert fig_dict['layout'][axis_type + suplot_1_axis]['range'] == fig_dict['layout'][axis_type + suplot_2_axis]['range'] + for suplot_1_axis, suplot_2_axis in [ + ("axis2", "axis3"), + ("axis2", "axis4"), + ("axis3", "axis4"), + ]: + for axis_type in ["x", "y"]: + assert ( + fig_dict["layout"][axis_type + suplot_1_axis]["range"] + == fig_dict["layout"][axis_type + suplot_2_axis]["range"] + ) # Test one-way with class labels - fig_one_way_class_labels = graph_partial_dependence(pipeline, X, features='magnesium', class_label='class_1', grid_resolution=20) + fig_one_way_class_labels = graph_partial_dependence( + pipeline, X, features="magnesium", class_label="class_1", grid_resolution=20 + ) assert isinstance(fig_one_way_class_labels, go.Figure) fig_dict = fig_one_way_class_labels.to_dict() - assert len(fig_dict['data']) == 1 - assert len(fig_dict['data'][0]['x']) == 20 - assert len(fig_dict['data'][0]['y']) == 20 - assert fig_dict['data'][0]['name'] == 'class_1' + assert len(fig_dict["data"]) == 1 + assert len(fig_dict["data"][0]["x"]) == 20 + assert len(fig_dict["data"][0]["y"]) == 20 + assert fig_dict["data"][0]["name"] == "class_1" msg = "Class wine is not one of the classes the pipeline was fit on: class_0, class_1, class_2" with pytest.raises(ValueError, match=msg): - graph_partial_dependence(pipeline, X, features='alcohol', class_label='wine') + graph_partial_dependence(pipeline, X, features="alcohol", class_label="wine") # Test two-way without class labels - fig_two_way_no_class_labels = graph_partial_dependence(pipeline, X, 
features=('magnesium', 'alcohol'), grid_resolution=20) + fig_two_way_no_class_labels = graph_partial_dependence( + pipeline, X, features=("magnesium", "alcohol"), grid_resolution=20 + ) assert isinstance(fig_two_way_no_class_labels, go.Figure) fig_dict = fig_two_way_no_class_labels.to_dict() - assert len(fig_dict['data']) == 3, "Figure does not have partial dependence data for each class." - assert all([len(fig_dict["data"][i]['x']) == 20 for i in range(3)]) - assert all([len(fig_dict["data"][i]['y']) == 20 for i in range(3)]) - assert [fig_dict["data"][i]['name'] for i in range(3)] == ["class_0", "class_1", "class_2"] + assert ( + len(fig_dict["data"]) == 3 + ), "Figure does not have partial dependence data for each class." + assert all([len(fig_dict["data"][i]["x"]) == 20 for i in range(3)]) + assert all([len(fig_dict["data"][i]["y"]) == 20 for i in range(3)]) + assert [fig_dict["data"][i]["name"] for i in range(3)] == [ + "class_0", + "class_1", + "class_2", + ] # Check that all the subplots axes have the same range - for suplot_1_axis, suplot_2_axis in [('axis', 'axis2'), ('axis', 'axis3'), ('axis2', 'axis3')]: - for axis_type in ['x', 'y']: - assert fig_dict['layout'][axis_type + suplot_1_axis]['range'] == fig_dict['layout'][axis_type + suplot_2_axis]['range'] + for suplot_1_axis, suplot_2_axis in [ + ("axis", "axis2"), + ("axis", "axis3"), + ("axis2", "axis3"), + ]: + for axis_type in ["x", "y"]: + assert ( + fig_dict["layout"][axis_type + suplot_1_axis]["range"] + == fig_dict["layout"][axis_type + suplot_2_axis]["range"] + ) # Test two-way with class labels - fig_two_way_class_labels = graph_partial_dependence(pipeline, X, features=('magnesium', 'alcohol'), class_label='class_1', grid_resolution=20) + fig_two_way_class_labels = graph_partial_dependence( + pipeline, + X, + features=("magnesium", "alcohol"), + class_label="class_1", + grid_resolution=20, + ) assert isinstance(fig_two_way_class_labels, go.Figure) fig_dict = fig_two_way_class_labels.to_dict() - assert len(fig_dict['data']) == 1 - assert len(fig_dict['data'][0]['x']) == 20 - assert len(fig_dict['data'][0]['y']) == 20 - assert fig_dict['data'][0]['name'] == 'class_1' + assert len(fig_dict["data"]) == 1 + assert len(fig_dict["data"][0]["x"]) == 20 + assert len(fig_dict["data"][0]["y"]) == 20 + assert fig_dict["data"][0]["name"] == "class_1" msg = "Class wine is not one of the classes the pipeline was fit on: class_0, class_1, class_2" with pytest.raises(ValueError, match=msg): - graph_partial_dependence(pipeline, X, features='alcohol', class_label='wine') + graph_partial_dependence(pipeline, X, features="alcohol", class_label="wine") -def test_partial_dependence_percentile_errors(logistic_regression_binary_pipeline_class): +def test_partial_dependence_percentile_errors( + logistic_regression_binary_pipeline_class, +): # random_col will be 5% 0, 95% 1 - X = pd.DataFrame({"A": [i % 3 for i in range(1000)], "B": [(j + 3) % 5 for j in range(1000)], - "random_col": [0 if i < 50 else 1 for i in range(1000)], - "random_col_2": [0 if i < 40 else 1 for i in range(1000)]}) + X = pd.DataFrame( + { + "A": [i % 3 for i in range(1000)], + "B": [(j + 3) % 5 for j in range(1000)], + "random_col": [0 if i < 50 else 1 for i in range(1000)], + "random_col_2": [0 if i < 40 else 1 for i in range(1000)], + } + ) y = pd.Series([i % 2 for i in range(1000)]) - pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = logistic_regression_binary_pipeline_class( + 
parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) pipeline.fit(X, y) - with pytest.raises(ValueError, match="Features \\('random_col'\\) are mostly one value, \\(1\\), and cannot be"): + with pytest.raises( + ValueError, + match="Features \\('random_col'\\) are mostly one value, \\(1\\), and cannot be", + ): partial_dependence(pipeline, X, features="random_col", grid_resolution=20) - with pytest.raises(ValueError, match="Features \\('random_col'\\) are mostly one value, \\(1\\), and cannot be"): - partial_dependence(pipeline, X, features="random_col", percentiles=(0.01, 0.955), grid_resolution=20) - with pytest.raises(ValueError, match="Features \\('random_col'\\) are mostly one value, \\(1\\), and cannot be"): - partial_dependence(pipeline, X, features=2, percentiles=(0.01, 0.955), grid_resolution=20) - with pytest.raises(ValueError, match="Features \\('random_col'\\) are mostly one value, \\(1\\), and cannot be"): - partial_dependence(pipeline, X, features=('A', "random_col"), percentiles=(0.01, 0.955), grid_resolution=20) - with pytest.raises(ValueError, match="Features \\('random_col', 'random_col_2'\\) are mostly one value, \\(1, 1\\), and cannot be"): - partial_dependence(pipeline, X, features=("random_col", "random_col_2"), - percentiles=(0.01, 0.955), grid_resolution=20) - - part_dep = partial_dependence(pipeline, X, features="random_col", percentiles=(0.01, 0.96), grid_resolution=20) - assert list(part_dep.columns) == ["feature_values", "partial_dependence", "class_label"] + with pytest.raises( + ValueError, + match="Features \\('random_col'\\) are mostly one value, \\(1\\), and cannot be", + ): + partial_dependence( + pipeline, + X, + features="random_col", + percentiles=(0.01, 0.955), + grid_resolution=20, + ) + with pytest.raises( + ValueError, + match="Features \\('random_col'\\) are mostly one value, \\(1\\), and cannot be", + ): + partial_dependence( + pipeline, X, features=2, percentiles=(0.01, 0.955), grid_resolution=20 + ) + with pytest.raises( + ValueError, + match="Features \\('random_col'\\) are mostly one value, \\(1\\), and cannot be", + ): + partial_dependence( + pipeline, + X, + features=("A", "random_col"), + percentiles=(0.01, 0.955), + grid_resolution=20, + ) + with pytest.raises( + ValueError, + match="Features \\('random_col', 'random_col_2'\\) are mostly one value, \\(1, 1\\), and cannot be", + ): + partial_dependence( + pipeline, + X, + features=("random_col", "random_col_2"), + percentiles=(0.01, 0.955), + grid_resolution=20, + ) + + part_dep = partial_dependence( + pipeline, X, features="random_col", percentiles=(0.01, 0.96), grid_resolution=20 + ) + assert list(part_dep.columns) == [ + "feature_values", + "partial_dependence", + "class_label", + ] assert len(part_dep["partial_dependence"]) == 2 assert len(part_dep["feature_values"]) == 2 assert not part_dep.isnull().any(axis=None) -@pytest.mark.parametrize('problem_type', ['binary', 'regression']) -def test_graph_partial_dependence_regression_and_binary_categorical(problem_type, linear_regression_pipeline_class, - X_y_regression, X_y_binary, - logistic_regression_binary_pipeline_class): - pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') - - if problem_type == 'binary': +@pytest.mark.parametrize("problem_type", ["binary", "regression"]) +def test_graph_partial_dependence_regression_and_binary_categorical( + problem_type, + linear_regression_pipeline_class, + X_y_regression, + X_y_binary, + logistic_regression_binary_pipeline_class, +): 
+ pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) + + if problem_type == "binary": X, y = X_y_binary - pipeline = logistic_regression_binary_pipeline_class({"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = logistic_regression_binary_pipeline_class( + {"Logistic Regression Classifier": {"n_jobs": 1}} + ) else: X, y = X_y_regression pipeline = linear_regression_pipeline_class({"Linear Regressor": {"n_jobs": 1}}) X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) y = pd.Series(y) - X['categorical_column'] = pd.Series([i % 3 for i in range(X.shape[0])]).astype('str') - X['categorical_column_2'] = pd.Series([i % 6 for i in range(X.shape[0])]).astype('str') + X["categorical_column"] = pd.Series([i % 3 for i in range(X.shape[0])]).astype( + "str" + ) + X["categorical_column_2"] = pd.Series([i % 6 for i in range(X.shape[0])]).astype( + "str" + ) pipeline.fit(X, y) - fig = graph_partial_dependence(pipeline, X, features='categorical_column', grid_resolution=5) - plot_data = fig.to_dict()['data'][0] - assert plot_data['type'] == 'bar' - assert plot_data['x'].tolist() == ['0', '1', '2'] + fig = graph_partial_dependence( + pipeline, X, features="categorical_column", grid_resolution=5 + ) + plot_data = fig.to_dict()["data"][0] + assert plot_data["type"] == "bar" + assert plot_data["x"].tolist() == ["0", "1", "2"] - fig = graph_partial_dependence(pipeline, X, features=('0', 'categorical_column'), - grid_resolution=5) + fig = graph_partial_dependence( + pipeline, X, features=("0", "categorical_column"), grid_resolution=5 + ) fig_dict = fig.to_dict() - plot_data = fig_dict['data'][0] - assert plot_data['type'] == 'contour' - assert fig_dict['layout']['yaxis']['ticktext'] == ['0', '1', '2'] - assert fig_dict['layout']['title']['text'] == "Partial Dependence of 'categorical_column' vs. '0'" - - fig = graph_partial_dependence(pipeline, X, features=('categorical_column_2', 'categorical_column'), - grid_resolution=5) + plot_data = fig_dict["data"][0] + assert plot_data["type"] == "contour" + assert fig_dict["layout"]["yaxis"]["ticktext"] == ["0", "1", "2"] + assert ( + fig_dict["layout"]["title"]["text"] + == "Partial Dependence of 'categorical_column' vs. '0'" + ) + + fig = graph_partial_dependence( + pipeline, + X, + features=("categorical_column_2", "categorical_column"), + grid_resolution=5, + ) fig_dict = fig.to_dict() - plot_data = fig_dict['data'][0] - assert plot_data['type'] == 'contour' - assert fig_dict['layout']['xaxis']['ticktext'] == ['0', '1', '2'] - assert fig_dict['layout']['yaxis']['ticktext'] == ['0', '1', '2', '3', '4', '5'] - assert fig_dict['layout']['title']['text'] == "Partial Dependence of 'categorical_column_2' vs. 'categorical_column'" - - -@pytest.mark.parametrize('class_label', [None, 'class_1']) -def test_partial_dependence_multiclass_categorical(class_label, - logistic_regression_multiclass_pipeline_class): - pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + plot_data = fig_dict["data"][0] + assert plot_data["type"] == "contour" + assert fig_dict["layout"]["xaxis"]["ticktext"] == ["0", "1", "2"] + assert fig_dict["layout"]["yaxis"]["ticktext"] == ["0", "1", "2", "3", "4", "5"] + assert ( + fig_dict["layout"]["title"]["text"] + == "Partial Dependence of 'categorical_column_2' vs. 
'categorical_column'" + ) + + +@pytest.mark.parametrize("class_label", [None, "class_1"]) +def test_partial_dependence_multiclass_categorical( + class_label, logistic_regression_multiclass_pipeline_class +): + pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y = load_wine() - X.ww['categorical_column'] = ww.init_series(pd.Series([i % 3 for i in range(X.shape[0])]).astype(str), logical_type="Categorical") - X.ww['categorical_column_2'] = ww.init_series(pd.Series([i % 6 for i in range(X.shape[0])]).astype(str), logical_type="Categorical") - - pipeline = logistic_regression_multiclass_pipeline_class({"Logistic Regression Classifier": {"n_jobs": 1}}) + X.ww["categorical_column"] = ww.init_series( + pd.Series([i % 3 for i in range(X.shape[0])]).astype(str), + logical_type="Categorical", + ) + X.ww["categorical_column_2"] = ww.init_series( + pd.Series([i % 6 for i in range(X.shape[0])]).astype(str), + logical_type="Categorical", + ) + + pipeline = logistic_regression_multiclass_pipeline_class( + {"Logistic Regression Classifier": {"n_jobs": 1}} + ) pipeline.fit(X, y) - fig = graph_partial_dependence(pipeline, X, features='categorical_column', class_label=class_label, - grid_resolution=5) - - for i, plot_data in enumerate(fig.to_dict()['data']): - assert plot_data['type'] == 'bar' - assert plot_data['x'].tolist() == ['0', '1', '2'] + fig = graph_partial_dependence( + pipeline, + X, + features="categorical_column", + class_label=class_label, + grid_resolution=5, + ) + + for i, plot_data in enumerate(fig.to_dict()["data"]): + assert plot_data["type"] == "bar" + assert plot_data["x"].tolist() == ["0", "1", "2"] if class_label is None: - assert plot_data['name'] == f'class_{i}' + assert plot_data["name"] == f"class_{i}" else: - assert plot_data['name'] == class_label - - fig = graph_partial_dependence(pipeline, X, features=('alcohol', 'categorical_column'), class_label=class_label, - grid_resolution=5) - - for i, plot_data in enumerate(fig.to_dict()['data']): - assert plot_data['type'] == 'contour' - assert fig.to_dict()['layout']['yaxis']['ticktext'] == ['0', '1', '2'] + assert plot_data["name"] == class_label + + fig = graph_partial_dependence( + pipeline, + X, + features=("alcohol", "categorical_column"), + class_label=class_label, + grid_resolution=5, + ) + + for i, plot_data in enumerate(fig.to_dict()["data"]): + assert plot_data["type"] == "contour" + assert fig.to_dict()["layout"]["yaxis"]["ticktext"] == ["0", "1", "2"] if class_label is None: - assert plot_data['name'] == f'class_{i}' + assert plot_data["name"] == f"class_{i}" else: - assert plot_data['name'] == class_label - - fig = graph_partial_dependence(pipeline, X, features=('categorical_column_2', 'categorical_column'), - class_label=class_label, grid_resolution=5) - - for i, plot_data in enumerate(fig.to_dict()['data']): - assert plot_data['type'] == 'contour' - assert fig.to_dict()['layout']['xaxis']['ticktext'] == ['0', '1', '2'] - assert fig.to_dict()['layout']['yaxis']['ticktext'] == ['0', '1', '2', '3', '4', '5'] + assert plot_data["name"] == class_label + + fig = graph_partial_dependence( + pipeline, + X, + features=("categorical_column_2", "categorical_column"), + class_label=class_label, + grid_resolution=5, + ) + + for i, plot_data in enumerate(fig.to_dict()["data"]): + assert plot_data["type"] == "contour" + assert fig.to_dict()["layout"]["xaxis"]["ticktext"] == ["0", "1", "2"] + assert fig.to_dict()["layout"]["yaxis"]["ticktext"] == [ + "0", + "1", + "2", 
+ "3", + "4", + "5", + ] if class_label is None: - assert plot_data['name'] == f'class_{i}' + assert plot_data["name"] == f"class_{i}" else: - assert plot_data['name'] == class_label + assert plot_data["name"] == class_label -def test_partial_dependence_all_nan_value_error(logistic_regression_binary_pipeline_class): +def test_partial_dependence_all_nan_value_error( + logistic_regression_binary_pipeline_class, +): pl = logistic_regression_binary_pipeline_class({}) X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) @@ -517,25 +804,51 @@ def test_partial_dependence_all_nan_value_error(logistic_regression_binary_pipel partial_dependence(pl, pred_df, features=0, grid_resolution=10) -@pytest.mark.parametrize('problem_type', ['binary', 'multiclass', 'regression']) -def test_partial_dependence_datetime(problem_type, X_y_regression, X_y_binary, X_y_multi): - if problem_type == 'binary': +@pytest.mark.parametrize("problem_type", ["binary", "multiclass", "regression"]) +def test_partial_dependence_datetime( + problem_type, X_y_regression, X_y_binary, X_y_multi +): + if problem_type == "binary": X, y = X_y_binary - pipeline = BinaryClassificationPipeline(component_graph=['Imputer', 'One Hot Encoder', 'DateTime Featurization Component', 'Standard Scaler', 'Logistic Regression Classifier']) - elif problem_type == 'multiclass': + pipeline = BinaryClassificationPipeline( + component_graph=[ + "Imputer", + "One Hot Encoder", + "DateTime Featurization Component", + "Standard Scaler", + "Logistic Regression Classifier", + ] + ) + elif problem_type == "multiclass": X, y = X_y_multi - pipeline = MulticlassClassificationPipeline(component_graph=['Imputer', 'One Hot Encoder', 'DateTime Featurization Component', 'Standard Scaler', 'Logistic Regression Classifier']) + pipeline = MulticlassClassificationPipeline( + component_graph=[ + "Imputer", + "One Hot Encoder", + "DateTime Featurization Component", + "Standard Scaler", + "Logistic Regression Classifier", + ] + ) else: X, y = X_y_regression - pipeline = RegressionPipeline(component_graph=['Imputer', 'One Hot Encoder', 'DateTime Featurization Component', 'Standard Scaler', 'Linear Regressor']) + pipeline = RegressionPipeline( + component_graph=[ + "Imputer", + "One Hot Encoder", + "DateTime Featurization Component", + "Standard Scaler", + "Linear Regressor", + ] + ) X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) y = pd.Series(y) - X['dt_column'] = pd.Series(pd.date_range('20200101', periods=X.shape[0])) + X["dt_column"] = pd.Series(pd.date_range("20200101", periods=X.shape[0])) pipeline.fit(X, y) - part_dep = partial_dependence(pipeline, X, features='dt_column') - if problem_type == 'multiclass': + part_dep = partial_dependence(pipeline, X, features="dt_column") + if problem_type == "multiclass": assert len(part_dep["partial_dependence"]) == 300 # 100 rows * 3 classes assert len(part_dep["feature_values"]) == 300 else: @@ -544,7 +857,7 @@ def test_partial_dependence_datetime(problem_type, X_y_regression, X_y_binary, X assert not part_dep.isnull().any(axis=None) part_dep = partial_dependence(pipeline, X, features=20) - if problem_type == 'multiclass': + if problem_type == "multiclass": assert len(part_dep["partial_dependence"]) == 300 # 100 rows * 3 classes assert len(part_dep["feature_values"]) == 300 else: @@ -552,62 +865,116 @@ def test_partial_dependence_datetime(problem_type, X_y_regression, X_y_binary, X assert len(part_dep["feature_values"]) == 100 assert not part_dep.isnull().any(axis=None) - with 
pytest.raises(ValueError, match='Two-way partial dependence is not supported for datetime columns.'): - part_dep = partial_dependence(pipeline, X, features=('0', 'dt_column')) - with pytest.raises(ValueError, match='Two-way partial dependence is not supported for datetime columns.'): + with pytest.raises( + ValueError, + match="Two-way partial dependence is not supported for datetime columns.", + ): + part_dep = partial_dependence(pipeline, X, features=("0", "dt_column")) + with pytest.raises( + ValueError, + match="Two-way partial dependence is not supported for datetime columns.", + ): part_dep = partial_dependence(pipeline, X, features=(0, 20)) -@pytest.mark.parametrize('problem_type', ['binary', 'regression']) -def test_graph_partial_dependence_regression_and_binary_datetime(problem_type, X_y_regression, X_y_binary): - pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') +@pytest.mark.parametrize("problem_type", ["binary", "regression"]) +def test_graph_partial_dependence_regression_and_binary_datetime( + problem_type, X_y_regression, X_y_binary +): + pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) - if problem_type == 'binary': + if problem_type == "binary": X, y = X_y_binary - pipeline = BinaryClassificationPipeline(component_graph=['Imputer', 'One Hot Encoder', 'DateTime Featurization Component', 'Standard Scaler', 'Logistic Regression Classifier']) + pipeline = BinaryClassificationPipeline( + component_graph=[ + "Imputer", + "One Hot Encoder", + "DateTime Featurization Component", + "Standard Scaler", + "Logistic Regression Classifier", + ] + ) else: X, y = X_y_regression - pipeline = RegressionPipeline(component_graph=['Imputer', 'One Hot Encoder', 'DateTime Featurization Component', 'Standard Scaler', 'Linear Regressor']) + pipeline = RegressionPipeline( + component_graph=[ + "Imputer", + "One Hot Encoder", + "DateTime Featurization Component", + "Standard Scaler", + "Linear Regressor", + ] + ) X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) y = pd.Series(y) - X['dt_column'] = pd.to_datetime(pd.Series(pd.date_range('20200101', periods=X.shape[0])), errors='coerce') + X["dt_column"] = pd.to_datetime( + pd.Series(pd.date_range("20200101", periods=X.shape[0])), errors="coerce" + ) pipeline.fit(X, y) - fig = graph_partial_dependence(pipeline, X, features='dt_column', grid_resolution=5) - plot_data = fig.to_dict()['data'][0] - assert plot_data['type'] == 'scatter' - assert plot_data['x'].tolist() == list(pd.date_range('20200101', periods=X.shape[0])) + fig = graph_partial_dependence(pipeline, X, features="dt_column", grid_resolution=5) + plot_data = fig.to_dict()["data"][0] + assert plot_data["type"] == "scatter" + assert plot_data["x"].tolist() == list( + pd.date_range("20200101", periods=X.shape[0]) + ) def test_graph_partial_dependence_regression_date_order(X_y_binary): - pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y = X_y_binary - pipeline = BinaryClassificationPipeline(component_graph=['Imputer', 'One Hot Encoder', 'DateTime Featurization Component', 'Standard Scaler', 'Logistic Regression Classifier']) + pipeline = BinaryClassificationPipeline( + component_graph=[ + "Imputer", + "One Hot Encoder", + "DateTime Featurization Component", + "Standard Scaler", + 
"Logistic Regression Classifier", + ] + ) X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) y = pd.Series(y) - dt_series = pd.Series(pd.date_range('20200101', periods=X.shape[0])).sample(frac=1).reset_index(drop=True) - X['dt_column'] = pd.to_datetime(dt_series, errors='coerce') + dt_series = ( + pd.Series(pd.date_range("20200101", periods=X.shape[0])) + .sample(frac=1) + .reset_index(drop=True) + ) + X["dt_column"] = pd.to_datetime(dt_series, errors="coerce") pipeline.fit(X, y) - fig = graph_partial_dependence(pipeline, X, features='dt_column', grid_resolution=5) - plot_data = fig.to_dict()['data'][0] - assert plot_data['type'] == 'scatter' - assert plot_data['x'].tolist() == list(pd.date_range('20200101', periods=X.shape[0])) + fig = graph_partial_dependence(pipeline, X, features="dt_column", grid_resolution=5) + plot_data = fig.to_dict()["data"][0] + assert plot_data["type"] == "scatter" + assert plot_data["x"].tolist() == list( + pd.date_range("20200101", periods=X.shape[0]) + ) def test_partial_dependence_respect_grid_resolution(fraud_100): X, y = fraud_100 - pl = BinaryClassificationPipeline(component_graph=["DateTime Featurization Component", "One Hot Encoder", "Random Forest Classifier"]) + pl = BinaryClassificationPipeline( + component_graph=[ + "DateTime Featurization Component", + "One Hot Encoder", + "Random Forest Classifier", + ] + ) pl.fit(X, y) dep = partial_dependence(pl, X, features="amount", grid_resolution=20) assert dep.shape[0] == 20 - assert dep.shape[0] != max(X.ww.select('categorical').describe().loc["unique"]) + 1 + assert dep.shape[0] != max(X.ww.select("categorical").describe().loc["unique"]) + 1 dep = partial_dependence(pl, X, features="provider", grid_resolution=20) - assert dep.shape[0] == X['provider'].nunique() - assert dep.shape[0] != max(X.ww.select('categorical').describe().loc["unique"]) + 1 + assert dep.shape[0] == X["provider"].nunique() + assert dep.shape[0] != max(X.ww.select("categorical").describe().loc["unique"]) + 1 diff --git a/evalml/tests/model_understanding_tests/test_permutation_importance.py b/evalml/tests/model_understanding_tests/test_permutation_importance.py index 8772bcd133..0ce95e34c4 100644 --- a/evalml/tests/model_understanding_tests/test_permutation_importance.py +++ b/evalml/tests/model_understanding_tests/test_permutation_importance.py @@ -6,7 +6,7 @@ from evalml.model_understanding.permutation_importance import ( calculate_permutation_importance, - calculate_permutation_importance_one_column + calculate_permutation_importance_one_column, ) from evalml.pipelines import BinaryClassificationPipeline, Transformer from evalml.pipelines.components import ( @@ -14,7 +14,7 @@ DateTimeFeaturizer, DFSTransformer, OneHotEncoder, - TextFeaturizer + TextFeaturizer, ) from evalml.utils import infer_feature_types @@ -26,6 +26,7 @@ class DoubleColumns(Transformer): That being said, I want to test that our implementation can handle that case in the event we add a transformer like that in the future. 
""" + name = "DoubleColumns" hyperparameter_ranges = {} @@ -53,23 +54,50 @@ def _get_feature_provenance(self): class LinearPipelineWithDropCols(BinaryClassificationPipeline): - component_graph = ['Drop Columns Transformer', OneHotEncoder, DateTimeFeaturizer, 'Random Forest Classifier'] + component_graph = [ + "Drop Columns Transformer", + OneHotEncoder, + DateTimeFeaturizer, + "Random Forest Classifier", + ] class LinearPipelineWithImputer(BinaryClassificationPipeline): - component_graph = ['Imputer', OneHotEncoder, DateTimeFeaturizer, 'Random Forest Classifier'] + component_graph = [ + "Imputer", + OneHotEncoder, + DateTimeFeaturizer, + "Random Forest Classifier", + ] class LinearPipelineSameFeatureUsedByTwoComponents(BinaryClassificationPipeline): - component_graph = ['Imputer', DateTimeFeaturizer, OneHotEncoder, 'Random Forest Classifier'] + component_graph = [ + "Imputer", + DateTimeFeaturizer, + OneHotEncoder, + "Random Forest Classifier", + ] class LinearPipelineTwoEncoders(BinaryClassificationPipeline): - component_graph = ['Imputer', DateTimeFeaturizer, OneHotEncoder, OneHotEncoder, "Random Forest Classifier"] + component_graph = [ + "Imputer", + DateTimeFeaturizer, + OneHotEncoder, + OneHotEncoder, + "Random Forest Classifier", + ] class LinearPipelineWithTextFeatures(BinaryClassificationPipeline): - component_graph = ['Imputer', 'Drop Columns Transformer', TextFeaturizer, OneHotEncoder, 'Random Forest Classifier'] + component_graph = [ + "Imputer", + "Drop Columns Transformer", + TextFeaturizer, + OneHotEncoder, + "Random Forest Classifier", + ] class LinearPipelineWithTextFeaturizerNoTextFeatures(LinearPipelineWithTextFeatures): @@ -77,229 +105,421 @@ class LinearPipelineWithTextFeaturizerNoTextFeatures(LinearPipelineWithTextFeatu class LinearPipelineWithDoubling(BinaryClassificationPipeline): - component_graph = ['Select Columns Transformer', DoubleColumns, DoubleColumns, DoubleColumns, 'Random Forest Classifier'] + component_graph = [ + "Select Columns Transformer", + DoubleColumns, + DoubleColumns, + DoubleColumns, + "Random Forest Classifier", + ] class LinearPipelineWithTargetEncoderAndOHE(BinaryClassificationPipeline): - component_graph = ['Imputer', DateTimeFeaturizer, OneHotEncoder, 'Target Encoder', "Random Forest Classifier"] + component_graph = [ + "Imputer", + DateTimeFeaturizer, + OneHotEncoder, + "Target Encoder", + "Random Forest Classifier", + ] class LinearPipelineCreateFeatureThenDropIt(BinaryClassificationPipeline): - component_graph = ['Select Columns Transformer', DoubleColumns, 'Drop Columns Transformer', 'Random Forest Classifier'] + component_graph = [ + "Select Columns Transformer", + DoubleColumns, + "Drop Columns Transformer", + "Random Forest Classifier", + ] class DagTwoEncoders(BinaryClassificationPipeline): component_graph = { - 'Imputer': ['Imputer'], - 'SelectNumeric': ["Select Columns Transformer", "Imputer"], - 'SelectCategorical1': ["Select Columns Transformer", "Imputer"], - 'SelectCategorical2': ["Select Columns Transformer", "Imputer"], - 'OHE_1': ['One Hot Encoder', 'SelectCategorical1'], - 'OHE_2': ['One Hot Encoder', 'SelectCategorical2'], - 'DT': ['DateTime Featurization Component', "SelectNumeric"], - 'Estimator': ['Random Forest Classifier', 'DT', 'OHE_1', 'OHE_2'], + "Imputer": ["Imputer"], + "SelectNumeric": ["Select Columns Transformer", "Imputer"], + "SelectCategorical1": ["Select Columns Transformer", "Imputer"], + "SelectCategorical2": ["Select Columns Transformer", "Imputer"], + "OHE_1": ["One Hot Encoder", 
"SelectCategorical1"], + "OHE_2": ["One Hot Encoder", "SelectCategorical2"], + "DT": ["DateTime Featurization Component", "SelectNumeric"], + "Estimator": ["Random Forest Classifier", "DT", "OHE_1", "OHE_2"], } class DagReuseFeatures(BinaryClassificationPipeline): component_graph = { - 'Imputer': ['Imputer'], - 'SelectNumeric': ["Select Columns Transformer", "Imputer"], - 'SelectCategorical1': ["Select Columns Transformer", "Imputer"], - 'SelectCategorical2': ["Select Columns Transformer", "Imputer"], - 'OHE_1': ['One Hot Encoder', 'SelectCategorical1'], - 'OHE_2': ['One Hot Encoder', 'SelectCategorical2'], - 'DT': ['DateTime Featurization Component', "SelectNumeric"], - 'OHE_3': ['One Hot Encoder', 'DT'], - 'Estimator': ['Random Forest Classifier', 'OHE_3', 'OHE_1', 'OHE_2'], + "Imputer": ["Imputer"], + "SelectNumeric": ["Select Columns Transformer", "Imputer"], + "SelectCategorical1": ["Select Columns Transformer", "Imputer"], + "SelectCategorical2": ["Select Columns Transformer", "Imputer"], + "OHE_1": ["One Hot Encoder", "SelectCategorical1"], + "OHE_2": ["One Hot Encoder", "SelectCategorical2"], + "DT": ["DateTime Featurization Component", "SelectNumeric"], + "OHE_3": ["One Hot Encoder", "DT"], + "Estimator": ["Random Forest Classifier", "OHE_3", "OHE_1", "OHE_2"], } -test_cases = [(LinearPipelineWithDropCols, {"Drop Columns Transformer": {'columns': ['country']}}), - (LinearPipelineWithImputer, {}), - (LinearPipelineSameFeatureUsedByTwoComponents, {'DateTime Featurization Component': {'encode_as_categories': True}}), - (LinearPipelineTwoEncoders, {'One Hot Encoder': {'features_to_encode': ['currency', 'expiration_date', 'provider']}, - 'One Hot Encoder_2': {'features_to_encode': ['region', 'country']}}), - (LinearPipelineWithTextFeatures, {'Drop Columns Transformer': {'columns': ['datetime']}}), - (LinearPipelineWithTextFeaturizerNoTextFeatures, {'Drop Columns Transformer': {'columns': ['datetime']}}), - (LinearPipelineWithDoubling, {'Select Columns Transformer': {'columns': ['amount']}}), - (LinearPipelineWithDoubling, {'Select Columns Transformer': {'columns': ['amount']}, - 'DoubleColumns': {'drop_old_columns': False}}), - (DagTwoEncoders, {'SelectNumeric': {'columns': ['card_id', 'store_id', 'datetime', 'amount', 'customer_present', 'lat', 'lng']}, - 'SelectCategorical1': {'columns': ['currency', 'expiration_date', 'provider']}, - 'SelectCategorical2': {'columns': ['region', 'country']}, - 'OHE_1': {'features_to_encode': ['currency', 'expiration_date', 'provider']}, - 'OHE_2': {'features_to_encode': ['region', 'country']}}), - (DagReuseFeatures, {'SelectNumeric': {'columns': ['card_id', 'store_id', 'datetime', 'amount', 'customer_present', 'lat', 'lng']}, - 'SelectCategorical1': {'columns': ['currency', 'expiration_date', 'provider']}, - 'SelectCategorical2': {'columns': ['region', 'country']}, - 'OHE_1': {'features_to_encode': ['currency', 'expiration_date', 'provider']}, - 'OHE_2': {'features_to_encode': ['region', 'country']}, - 'DT': {'encode_as_categories': True}}), - (LinearPipelineWithTargetEncoderAndOHE, {'One Hot Encoder': {'features_to_encode': ['currency', 'expiration_date', 'provider']}, - 'Target Encoder': {'cols': ['region', 'country']}}), - (LinearPipelineCreateFeatureThenDropIt, {'Select Columns Transformer': {'columns': ['amount']}, - 'DoubleColumns': {'drop_old_columns': False}, - 'Drop Columns Transformer': {'columns': ['amount_doubled']}}) - ] - - -@pytest.mark.parametrize('pipeline_class, parameters', test_cases) 
-@patch('evalml.pipelines.PipelineBase._supports_fast_permutation_importance', new_callable=PropertyMock) -def test_fast_permutation_importance_matches_slow_output(mock_supports_fast_importance, pipeline_class, parameters, - has_minimal_dependencies, fraud_100): - if has_minimal_dependencies and pipeline_class == LinearPipelineWithTargetEncoderAndOHE: - pytest.skip("Skipping test_fast_permutation_importance_matches_sklearn_output for target encoder cause " - "dependency not installed.") +test_cases = [ + ( + LinearPipelineWithDropCols, + {"Drop Columns Transformer": {"columns": ["country"]}}, + ), + (LinearPipelineWithImputer, {}), + ( + LinearPipelineSameFeatureUsedByTwoComponents, + {"DateTime Featurization Component": {"encode_as_categories": True}}, + ), + ( + LinearPipelineTwoEncoders, + { + "One Hot Encoder": { + "features_to_encode": ["currency", "expiration_date", "provider"] + }, + "One Hot Encoder_2": {"features_to_encode": ["region", "country"]}, + }, + ), + ( + LinearPipelineWithTextFeatures, + {"Drop Columns Transformer": {"columns": ["datetime"]}}, + ), + ( + LinearPipelineWithTextFeaturizerNoTextFeatures, + {"Drop Columns Transformer": {"columns": ["datetime"]}}, + ), + ( + LinearPipelineWithDoubling, + {"Select Columns Transformer": {"columns": ["amount"]}}, + ), + ( + LinearPipelineWithDoubling, + { + "Select Columns Transformer": {"columns": ["amount"]}, + "DoubleColumns": {"drop_old_columns": False}, + }, + ), + ( + DagTwoEncoders, + { + "SelectNumeric": { + "columns": [ + "card_id", + "store_id", + "datetime", + "amount", + "customer_present", + "lat", + "lng", + ] + }, + "SelectCategorical1": { + "columns": ["currency", "expiration_date", "provider"] + }, + "SelectCategorical2": {"columns": ["region", "country"]}, + "OHE_1": { + "features_to_encode": ["currency", "expiration_date", "provider"] + }, + "OHE_2": {"features_to_encode": ["region", "country"]}, + }, + ), + ( + DagReuseFeatures, + { + "SelectNumeric": { + "columns": [ + "card_id", + "store_id", + "datetime", + "amount", + "customer_present", + "lat", + "lng", + ] + }, + "SelectCategorical1": { + "columns": ["currency", "expiration_date", "provider"] + }, + "SelectCategorical2": {"columns": ["region", "country"]}, + "OHE_1": { + "features_to_encode": ["currency", "expiration_date", "provider"] + }, + "OHE_2": {"features_to_encode": ["region", "country"]}, + "DT": {"encode_as_categories": True}, + }, + ), + ( + LinearPipelineWithTargetEncoderAndOHE, + { + "One Hot Encoder": { + "features_to_encode": ["currency", "expiration_date", "provider"] + }, + "Target Encoder": {"cols": ["region", "country"]}, + }, + ), + ( + LinearPipelineCreateFeatureThenDropIt, + { + "Select Columns Transformer": {"columns": ["amount"]}, + "DoubleColumns": {"drop_old_columns": False}, + "Drop Columns Transformer": {"columns": ["amount_doubled"]}, + }, + ), +] + + +@pytest.mark.parametrize("pipeline_class, parameters", test_cases) +@patch( + "evalml.pipelines.PipelineBase._supports_fast_permutation_importance", + new_callable=PropertyMock, +) +def test_fast_permutation_importance_matches_slow_output( + mock_supports_fast_importance, + pipeline_class, + parameters, + has_minimal_dependencies, + fraud_100, +): + if ( + has_minimal_dependencies + and pipeline_class == LinearPipelineWithTargetEncoderAndOHE + ): + pytest.skip( + "Skipping test_fast_permutation_importance_matches_sklearn_output for target encoder cause " + "dependency not installed." 
+ ) X, y = fraud_100 if pipeline_class == LinearPipelineWithTextFeatures: - X.ww.set_types(logical_types={'provider': 'NaturalLanguage'}) + X.ww.set_types(logical_types={"provider": "NaturalLanguage"}) mock_supports_fast_importance.return_value = True - parameters['Random Forest Classifier'] = {'n_jobs': 1} + parameters["Random Forest Classifier"] = {"n_jobs": 1} pipeline = pipeline_class(pipeline_class.component_graph, parameters=parameters) pipeline.fit(X, y) - fast_scores = calculate_permutation_importance(pipeline, X, y, objective='Log Loss Binary', - random_seed=0) + fast_scores = calculate_permutation_importance( + pipeline, X, y, objective="Log Loss Binary", random_seed=0 + ) mock_supports_fast_importance.return_value = False - slow_scores = calculate_permutation_importance(pipeline, X, y, objective='Log Loss Binary', - random_seed=0) + slow_scores = calculate_permutation_importance( + pipeline, X, y, objective="Log Loss Binary", random_seed=0 + ) pd.testing.assert_frame_equal(fast_scores, slow_scores) precomputed_features = pipeline.compute_estimator_features(X, y) for col in X.columns: mock_supports_fast_importance.return_value = True - permutation_importance_one_col_fast = calculate_permutation_importance_one_column(pipeline, X, y, col, 'Log Loss Binary', fast=True, precomputed_features=precomputed_features) + permutation_importance_one_col_fast = ( + calculate_permutation_importance_one_column( + pipeline, + X, + y, + col, + "Log Loss Binary", + fast=True, + precomputed_features=precomputed_features, + ) + ) mock_supports_fast_importance.return_value = False - permutation_importance_one_col_slow = calculate_permutation_importance_one_column(pipeline, X, y, col, 'Log Loss Binary', fast=False) - np.testing.assert_almost_equal(permutation_importance_one_col_fast, permutation_importance_one_col_slow) + permutation_importance_one_col_slow = ( + calculate_permutation_importance_one_column( + pipeline, X, y, col, "Log Loss Binary", fast=False + ) + ) + np.testing.assert_almost_equal( + permutation_importance_one_col_fast, permutation_importance_one_col_slow + ) class PipelineWithDimReduction(BinaryClassificationPipeline): - component_graph = [PCA, 'Logistic Regression Classifier'] + component_graph = [PCA, "Logistic Regression Classifier"] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) + super().__init__( + self.component_graph, parameters=parameters, random_seed=random_seed + ) class EnsembleDag(BinaryClassificationPipeline): component_graph = { - 'Imputer_1': ['Imputer'], - 'Imputer_2': ['Imputer'], - 'OHE_1': ['One Hot Encoder', 'Imputer_1'], - 'OHE_2': ['One Hot Encoder', 'Imputer_2'], - 'DT_1': ['DateTime Featurization Component', 'OHE_1'], - 'DT_2': ['DateTime Featurization Component', 'OHE_2'], - 'Estimator_1': ['Random Forest Classifier', 'DT_1'], - 'Estimator_2': ['Extra Trees Classifier', 'DT_2'], - 'Ensembler': ['Logistic Regression Classifier', 'Estimator_1', 'Estimator_2'] + "Imputer_1": ["Imputer"], + "Imputer_2": ["Imputer"], + "OHE_1": ["One Hot Encoder", "Imputer_1"], + "OHE_2": ["One Hot Encoder", "Imputer_2"], + "DT_1": ["DateTime Featurization Component", "OHE_1"], + "DT_2": ["DateTime Featurization Component", "OHE_2"], + "Estimator_1": ["Random Forest Classifier", "DT_1"], + "Estimator_2": ["Extra Trees Classifier", "DT_2"], + "Ensembler": ["Logistic Regression Classifier", "Estimator_1", "Estimator_2"], } def __init__(self, parameters, random_seed=0): - 
super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) + super().__init__( + self.component_graph, parameters=parameters, random_seed=random_seed + ) class PipelineWithDFS(BinaryClassificationPipeline): - component_graph = [DFSTransformer, 'Logistic Regression Classifier'] + component_graph = [DFSTransformer, "Logistic Regression Classifier"] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) + super().__init__( + self.component_graph, parameters=parameters, random_seed=random_seed + ) class PipelineWithCustomComponent(BinaryClassificationPipeline): - component_graph = [DoubleColumns, 'Logistic Regression Classifier'] + component_graph = [DoubleColumns, "Logistic Regression Classifier"] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) + super().__init__( + self.component_graph, parameters=parameters, random_seed=random_seed + ) class StackedEnsemblePipeline(BinaryClassificationPipeline): - component_graph = ['Stacked Ensemble Classifier'] + component_graph = ["Stacked Ensemble Classifier"] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) + super().__init__( + self.component_graph, parameters=parameters, random_seed=random_seed + ) -pipelines_that_do_not_support_fast_permutation_importance = [PipelineWithDimReduction, - PipelineWithDFS, - PipelineWithCustomComponent, - EnsembleDag, StackedEnsemblePipeline] +pipelines_that_do_not_support_fast_permutation_importance = [ + PipelineWithDimReduction, + PipelineWithDFS, + PipelineWithCustomComponent, + EnsembleDag, + StackedEnsemblePipeline, +] -@pytest.mark.parametrize('pipeline_class', pipelines_that_do_not_support_fast_permutation_importance) +@pytest.mark.parametrize( + "pipeline_class", pipelines_that_do_not_support_fast_permutation_importance +) def test_supports_fast_permutation_importance(pipeline_class): - params = {'Stacked Ensemble Classifier': {'input_pipelines': [PipelineWithDFS({})]}} + params = {"Stacked Ensemble Classifier": {"input_pipelines": [PipelineWithDFS({})]}} assert not pipeline_class(params)._supports_fast_permutation_importance -def test_get_permutation_importance_invalid_objective(X_y_regression, linear_regression_pipeline_class): +def test_get_permutation_importance_invalid_objective( + X_y_regression, linear_regression_pipeline_class +): X, y = X_y_regression pipeline = linear_regression_pipeline_class(parameters={}, random_seed=42) - with pytest.raises(ValueError, match=f"Given objective 'MCC Multiclass' cannot be used with '{pipeline.name}'"): + with pytest.raises( + ValueError, + match=f"Given objective 'MCC Multiclass' cannot be used with '{pipeline.name}'", + ): calculate_permutation_importance(pipeline, X, y, "mcc multiclass") -@pytest.mark.parametrize("data_type", ['np', 'pd', 'ww']) -def test_get_permutation_importance_binary(X_y_binary, data_type, logistic_regression_binary_pipeline_class, - binary_core_objectives, make_data_type): +@pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) +def test_get_permutation_importance_binary( + X_y_binary, + data_type, + logistic_regression_binary_pipeline_class, + binary_core_objectives, + make_data_type, +): X, y = X_y_binary X = make_data_type(data_type, X) y = make_data_type(data_type, y) - pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression 
Classifier": {"n_jobs": 1}}, - random_seed=42) + pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, random_seed=42 + ) pipeline.fit(X, y) for objective in binary_core_objectives: - permutation_importance = calculate_permutation_importance(pipeline, X, y, objective) + permutation_importance = calculate_permutation_importance( + pipeline, X, y, objective + ) assert list(permutation_importance.columns) == ["feature", "importance"] assert not permutation_importance.isnull().all().all() - permutation_importance_sorted = permutation_importance.sort_values('feature', ascending=True).reset_index(drop=True) + permutation_importance_sorted = permutation_importance.sort_values( + "feature", ascending=True + ).reset_index(drop=True) X = pd.DataFrame(X) for col in X.columns: - permutation_importance_one_col = calculate_permutation_importance_one_column(pipeline, X, y, col, objective, fast=False) - np.testing.assert_almost_equal(permutation_importance_sorted["importance"][col], permutation_importance_one_col) - - -def test_get_permutation_importance_multiclass(X_y_multi, logistic_regression_multiclass_pipeline_class, - multiclass_core_objectives): + permutation_importance_one_col = ( + calculate_permutation_importance_one_column( + pipeline, X, y, col, objective, fast=False + ) + ) + np.testing.assert_almost_equal( + permutation_importance_sorted["importance"][col], + permutation_importance_one_col, + ) + + +def test_get_permutation_importance_multiclass( + X_y_multi, logistic_regression_multiclass_pipeline_class, multiclass_core_objectives +): X, y = X_y_multi X = pd.DataFrame(X) - pipeline = logistic_regression_multiclass_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, - random_seed=42) + pipeline = logistic_regression_multiclass_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, random_seed=42 + ) pipeline.fit(X, y) for objective in multiclass_core_objectives: - permutation_importance = calculate_permutation_importance(pipeline, X, y, objective) + permutation_importance = calculate_permutation_importance( + pipeline, X, y, objective + ) assert list(permutation_importance.columns) == ["feature", "importance"] assert not permutation_importance.isnull().all().all() - permutation_importance_sorted = permutation_importance.sort_values('feature', ascending=True).reset_index(drop=True) + permutation_importance_sorted = permutation_importance.sort_values( + "feature", ascending=True + ).reset_index(drop=True) for col in X.columns: - permutation_importance_one_col = calculate_permutation_importance_one_column(pipeline, X, y, col, objective, fast=False) - np.testing.assert_almost_equal(permutation_importance_sorted["importance"][col], permutation_importance_one_col) - - -def test_get_permutation_importance_regression(linear_regression_pipeline_class, regression_core_objectives): + permutation_importance_one_col = ( + calculate_permutation_importance_one_column( + pipeline, X, y, col, objective, fast=False + ) + ) + np.testing.assert_almost_equal( + permutation_importance_sorted["importance"][col], + permutation_importance_one_col, + ) + + +def test_get_permutation_importance_regression( + linear_regression_pipeline_class, regression_core_objectives +): X = pd.DataFrame([1, 2, 1, 2, 1, 2, 1, 2, 1, 2]) y = pd.Series([1, 2, 1, 2, 1, 2, 1, 2, 1, 2]) - pipeline = linear_regression_pipeline_class(parameters={"Linear Regressor": {"n_jobs": 1}}, - random_seed=42) + pipeline = 
linear_regression_pipeline_class( + parameters={"Linear Regressor": {"n_jobs": 1}}, random_seed=42 + ) pipeline.fit(X, y) for objective in regression_core_objectives: - permutation_importance = calculate_permutation_importance(pipeline, X, y, objective) + permutation_importance = calculate_permutation_importance( + pipeline, X, y, objective + ) assert list(permutation_importance.columns) == ["feature", "importance"] assert not permutation_importance.isnull().all().all() - permutation_importance_sorted = permutation_importance.sort_values('feature', ascending=True).reset_index(drop=True) + permutation_importance_sorted = permutation_importance.sort_values( + "feature", ascending=True + ).reset_index(drop=True) for col in X.columns: - permutation_importance_one_col = calculate_permutation_importance_one_column(pipeline, X, y, col, objective, fast=False) - np.testing.assert_almost_equal(permutation_importance_sorted["importance"][col], permutation_importance_one_col) - - -def test_get_permutation_importance_correlated_features(logistic_regression_binary_pipeline_class): + permutation_importance_one_col = ( + calculate_permutation_importance_one_column( + pipeline, X, y, col, objective, fast=False + ) + ) + np.testing.assert_almost_equal( + permutation_importance_sorted["importance"][col], + permutation_importance_one_col, + ) + + +def test_get_permutation_importance_correlated_features( + logistic_regression_binary_pipeline_class, +): y = pd.Series([1, 0, 1, 1]) X = pd.DataFrame() X["correlated"] = y * 2 @@ -307,11 +527,17 @@ def test_get_permutation_importance_correlated_features(logistic_regression_bina y = y.astype(bool) pipeline = logistic_regression_binary_pipeline_class(parameters={}, random_seed=42) pipeline.fit(X, y) - importance = calculate_permutation_importance(pipeline, X, y, objective="Log Loss Binary", random_seed=0) + importance = calculate_permutation_importance( + pipeline, X, y, objective="Log Loss Binary", random_seed=0 + ) assert list(importance.columns) == ["feature", "importance"] assert not importance.isnull().all().all() - correlated_importance_val = importance["importance"][importance.index[importance["feature"] == "correlated"][0]] - not_correlated_importance_val = importance["importance"][importance.index[importance["feature"] == "not correlated"][0]] + correlated_importance_val = importance["importance"][ + importance.index[importance["feature"] == "correlated"][0] + ] + not_correlated_importance_val = importance["importance"][ + importance.index[importance["feature"] == "not correlated"][0] + ] assert correlated_importance_val > not_correlated_importance_val @@ -324,7 +550,9 @@ def test_undersampler(X_y_binary): X, y = X_y_binary X = pd.DataFrame(X) y = pd.Series(y) - pipeline = BinaryClassificationPipeline(component_graph=["Undersampler", "Elastic Net Classifier"]) + pipeline = BinaryClassificationPipeline( + component_graph=["Undersampler", "Elastic Net Classifier"] + ) pipeline.fit(X=X, y=y) pipeline.predict(X) test = calculate_permutation_importance(pipeline, X, y, objective="Log Loss Binary") @@ -332,27 +560,57 @@ def test_undersampler(X_y_binary): def test_permutation_importance_oversampler(fraud_100): - pytest.importorskip('imblearn.over_sampling', reason='Skipping test because imbalanced-learn not installed') + pytest.importorskip( + "imblearn.over_sampling", + reason="Skipping test because imbalanced-learn not installed", + ) X, y = fraud_100 - pipeline = BinaryClassificationPipeline(component_graph=["Imputer", "One Hot Encoder", "DateTime Featurization 
Component", "SMOTENC Oversampler", "Decision Tree Classifier"]) + pipeline = BinaryClassificationPipeline( + component_graph=[ + "Imputer", + "One Hot Encoder", + "DateTime Featurization Component", + "SMOTENC Oversampler", + "Decision Tree Classifier", + ] + ) pipeline.fit(X=X, y=y) pipeline.predict(X) - importance = calculate_permutation_importance(pipeline, X, y, objective="Log Loss Binary") + importance = calculate_permutation_importance( + pipeline, X, y, objective="Log Loss Binary" + ) assert not importance.isnull().all().all() -def test_get_permutation_importance_one_column_fast_no_precomputed_features(X_y_binary, logistic_regression_binary_pipeline_class): +def test_get_permutation_importance_one_column_fast_no_precomputed_features( + X_y_binary, logistic_regression_binary_pipeline_class +): X, y = X_y_binary - pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, - random_seed=42) - with pytest.raises(ValueError, match="Fast method of calculating permutation importance requires precomputed_features"): - calculate_permutation_importance_one_column(pipeline, X, y, 0, "log loss binary", fast=True) - - -@pytest.mark.parametrize('pipeline_class', pipelines_that_do_not_support_fast_permutation_importance) -def test_get_permutation_importance_one_column_pipeline_does_not_support_fast(X_y_binary, pipeline_class): + pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, random_seed=42 + ) + with pytest.raises( + ValueError, + match="Fast method of calculating permutation importance requires precomputed_features", + ): + calculate_permutation_importance_one_column( + pipeline, X, y, 0, "log loss binary", fast=True + ) + + +@pytest.mark.parametrize( + "pipeline_class", pipelines_that_do_not_support_fast_permutation_importance +) +def test_get_permutation_importance_one_column_pipeline_does_not_support_fast( + X_y_binary, pipeline_class +): X, y = X_y_binary - params = {'Stacked Ensemble Classifier': {'input_pipelines': [PipelineWithDFS({})]}} + params = {"Stacked Ensemble Classifier": {"input_pipelines": [PipelineWithDFS({})]}} assert not pipeline_class(params)._supports_fast_permutation_importance - with pytest.raises(ValueError, match="Pipeline does not support fast permutation importance calculation"): - calculate_permutation_importance_one_column(pipeline_class(params), X, y, 0, "log loss binary", fast=True) + with pytest.raises( + ValueError, + match="Pipeline does not support fast permutation importance calculation", + ): + calculate_permutation_importance_one_column( + pipeline_class(params), X, y, 0, "log loss binary", fast=True + ) diff --git a/evalml/tests/objective_tests/test_binary_classification_objective.py b/evalml/tests/objective_tests/test_binary_classification_objective.py index 6209c10d86..5c009e266a 100644 --- a/evalml/tests/objective_tests/test_binary_classification_objective.py +++ b/evalml/tests/objective_tests/test_binary_classification_objective.py @@ -12,7 +12,9 @@ def test_optimize_threshold(): ypred_proba = np.array([0.2, 0.4]) y_true = np.array([0, 1]) obj = F1() - np.random.seed(42) # unfortunately scipy.optimize.minimize_scalar has no ability to accept seed as input + np.random.seed( + 42 + ) # unfortunately scipy.optimize.minimize_scalar has no ability to accept seed as input threshold = obj.optimize_threshold(ypred_proba, y_true) assert 0.2 < threshold and threshold < 0.4 @@ -22,7 +24,9 @@ def test_optimize_threshold_neg(): y_true = 
np.array([0, 1]) obj = AUC() np.random.seed(0) - with pytest.raises(RuntimeError, match="Trying to optimize objective that can't be optimized!"): + with pytest.raises( + RuntimeError, match="Trying to optimize objective that can't be optimized!" + ): obj.optimize_threshold(ypred_proba, y_true) @@ -34,14 +38,22 @@ def test_can_optimize_threshold(): def test_decision_function(): ypred_proba = np.arange(6) / 5.0 obj = F1() - pd.testing.assert_series_equal(obj.decision_function(ypred_proba), - pd.Series(np.array([0] * 3 + [1] * 3), dtype=bool)) - pd.testing.assert_series_equal(obj.decision_function(ypred_proba, threshold=0.5), - pd.Series(np.array([0] * 3 + [1] * 3), dtype=bool)) - pd.testing.assert_series_equal(obj.decision_function(ypred_proba, threshold=0.0), - pd.Series(np.array([0] + [1] * 5, dtype=int), dtype=bool)) - pd.testing.assert_series_equal(obj.decision_function(ypred_proba, threshold=1.0), - pd.Series(np.array([0] * 6, dtype=int), dtype=bool)) + pd.testing.assert_series_equal( + obj.decision_function(ypred_proba), + pd.Series(np.array([0] * 3 + [1] * 3), dtype=bool), + ) + pd.testing.assert_series_equal( + obj.decision_function(ypred_proba, threshold=0.5), + pd.Series(np.array([0] * 3 + [1] * 3), dtype=bool), + ) + pd.testing.assert_series_equal( + obj.decision_function(ypred_proba, threshold=0.0), + pd.Series(np.array([0] + [1] * 5, dtype=int), dtype=bool), + ) + pd.testing.assert_series_equal( + obj.decision_function(ypred_proba, threshold=1.0), + pd.Series(np.array([0] * 6, dtype=int), dtype=bool), + ) def test_decision_function_neg(): @@ -49,23 +61,30 @@ def test_decision_function_neg(): y_true = pd.Series(np.array([0] * 3 + [1] * 3), dtype=bool) obj = F1() pd.testing.assert_series_equal(obj.decision_function(ypred_proba), y_true) - pd.testing.assert_series_equal(obj.decision_function(pd.Series(ypred_proba, dtype=float)), y_true) + pd.testing.assert_series_equal( + obj.decision_function(pd.Series(ypred_proba, dtype=float)), y_true + ) class TestBinaryObjective(metaclass=ABCMeta): __test__ = False def assign_problem_type(self): - self.problem_type = 'binary' + self.problem_type = "binary" @abstractmethod def assign_objective(self, **kwargs): - """Get objective object using specified parameters - """ + """Get objective object using specified parameters""" def run_pipeline(self, X_y_binary, **kwargs): self.X, self.y = X_y_binary - automl = AutoMLSearch(X_train=self.X, y_train=self.y, problem_type=self.problem_type, objective=self.objective, max_iterations=1) + automl = AutoMLSearch( + X_train=self.X, + y_train=self.y, + problem_type=self.problem_type, + objective=self.objective, + max_iterations=1, + ) automl.search() pipeline = automl.best_pipeline @@ -86,26 +105,25 @@ def test_score(self, y_true, y_predicted, expected_score): @abstractmethod def test_all_base_tests(self): - """Run all relevant tests from the base class - """ + """Run all relevant tests from the base class""" - @pytest.fixture(scope='class') + @pytest.fixture(scope="class") def fix_y_pred_na(self): return np.array([np.nan, 0, 0]) - @pytest.fixture(scope='class') + @pytest.fixture(scope="class") def fix_y_true(self): return np.array([1, 2, 1]) - @pytest.fixture(scope='class') + @pytest.fixture(scope="class") def fix_y_pred_diff_len(self): return np.array([0, 1]) - @pytest.fixture(scope='class') + @pytest.fixture(scope="class") def fix_empty_array(self): return np.array([]) - @pytest.fixture(scope='class') + @pytest.fixture(scope="class") def fix_y_pred_multi(self): return np.array([0, 1, 2]) @@ -122,5 +140,7 @@ 
def zero_input_lengths(self, fix_empty_array): self.objective.score(fix_empty_array, fix_empty_array) def binary_more_than_two_unique_values(self, fix_y_pred_multi, fix_y_true): - with pytest.raises(ValueError, match="y_predicted contains more than two unique values"): + with pytest.raises( + ValueError, match="y_predicted contains more than two unique values" + ): self.objective.score(fix_y_true, fix_y_pred_multi) diff --git a/evalml/tests/objective_tests/test_cost_benefit_matrix.py b/evalml/tests/objective_tests/test_cost_benefit_matrix.py index 1786856af6..04a15452d6 100644 --- a/evalml/tests/objective_tests/test_cost_benefit_matrix.py +++ b/evalml/tests/objective_tests/test_cost_benefit_matrix.py @@ -7,26 +7,46 @@ def test_cbm_init(): - with pytest.raises(ValueError, match="Parameters to CostBenefitMatrix must all be numeric values."): - CostBenefitMatrix(true_positive=None, true_negative=-1, - false_positive=-7, false_negative=-2) - with pytest.raises(ValueError, match="Parameters to CostBenefitMatrix must all be numeric values."): - CostBenefitMatrix(true_positive=1, true_negative=-1, - false_positive=None, false_negative=-2) - with pytest.raises(ValueError, match="Parameters to CostBenefitMatrix must all be numeric values."): - CostBenefitMatrix(true_positive=1, true_negative=None, - false_positive=-7, false_negative=-2) - with pytest.raises(ValueError, match="Parameters to CostBenefitMatrix must all be numeric values."): - CostBenefitMatrix(true_positive=3, true_negative=-1, - false_positive=-7, false_negative=None) + with pytest.raises( + ValueError, match="Parameters to CostBenefitMatrix must all be numeric values." + ): + CostBenefitMatrix( + true_positive=None, true_negative=-1, false_positive=-7, false_negative=-2 + ) + with pytest.raises( + ValueError, match="Parameters to CostBenefitMatrix must all be numeric values." + ): + CostBenefitMatrix( + true_positive=1, true_negative=-1, false_positive=None, false_negative=-2 + ) + with pytest.raises( + ValueError, match="Parameters to CostBenefitMatrix must all be numeric values." + ): + CostBenefitMatrix( + true_positive=1, true_negative=None, false_positive=-7, false_negative=-2 + ) + with pytest.raises( + ValueError, match="Parameters to CostBenefitMatrix must all be numeric values." 
+ ): + CostBenefitMatrix( + true_positive=3, true_negative=-1, false_positive=-7, false_negative=None + ) @pytest.mark.parametrize("optimize_thresholds", [True, False]) def test_cbm_objective_automl(optimize_thresholds, X_y_binary): X, y = X_y_binary - cbm = CostBenefitMatrix(true_positive=10, true_negative=-1, - false_positive=-7, false_negative=-2) - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective=cbm, max_iterations=2, optimize_thresholds=optimize_thresholds) + cbm = CostBenefitMatrix( + true_positive=10, true_negative=-1, false_positive=-7, false_negative=-2 + ) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective=cbm, + max_iterations=2, + optimize_thresholds=optimize_thresholds, + ) automl.search() pipeline = automl.best_pipeline @@ -34,33 +54,42 @@ def test_cbm_objective_automl(optimize_thresholds, X_y_binary): predictions = pipeline.predict(X, cbm) assert not np.isnan(predictions).values.any() assert not np.isnan(pipeline.predict_proba(X)).values.any() - assert not np.isnan(pipeline.score(X, y, [cbm])['Cost Benefit Matrix']) + assert not np.isnan(pipeline.score(X, y, [cbm])["Cost Benefit Matrix"]) -@pytest.mark.parametrize("data_type", ['ww', 'pd']) +@pytest.mark.parametrize("data_type", ["ww", "pd"]) def test_cbm_objective_function(data_type, make_data_type): y_true = pd.Series([0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) y_predicted = pd.Series([0, 0, 1, 0, 0, 0, 0, 1, 1, 1]) y_true = make_data_type(data_type, y_true) y_predicted = make_data_type(data_type, y_predicted) - cbm = CostBenefitMatrix(true_positive=10, true_negative=-1, - false_positive=-7, false_negative=-2) - assert np.isclose(cbm.objective_function(y_true, y_predicted), ((3 * 10) + (-1 * 2) + (1 * -7) + (4 * -2)) / 10) + cbm = CostBenefitMatrix( + true_positive=10, true_negative=-1, false_positive=-7, false_negative=-2 + ) + assert np.isclose( + cbm.objective_function(y_true, y_predicted), + ((3 * 10) + (-1 * 2) + (1 * -7) + (4 * -2)) / 10, + ) def test_cbm_objective_function_floats(): y_true = pd.Series([0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) y_predicted = pd.Series([0, 0, 1, 0, 0, 0, 0, 1, 1, 1]) - cbm = CostBenefitMatrix(true_positive=5.1, true_negative=-1.2, - false_positive=-6.7, false_negative=-0.1) - assert np.isclose(cbm.objective_function(y_true, y_predicted), ((3 * 5.1) + (-1.2 * 2) + (1 * -6.7) + (4 * -0.1)) / 10) + cbm = CostBenefitMatrix( + true_positive=5.1, true_negative=-1.2, false_positive=-6.7, false_negative=-0.1 + ) + assert np.isclose( + cbm.objective_function(y_true, y_predicted), + ((3 * 5.1) + (-1.2 * 2) + (1 * -6.7) + (4 * -0.1)) / 10, + ) def test_cbm_input_contains_nan(X_y_binary): y_predicted = pd.Series([np.nan, 0, 0]) y_true = pd.Series([1, 2, 1]) - cbm = CostBenefitMatrix(true_positive=10, true_negative=-1, - false_positive=-7, false_negative=-2) + cbm = CostBenefitMatrix( + true_positive=10, true_negative=-1, false_positive=-7, false_negative=-2 + ) with pytest.raises(ValueError, match="y_predicted contains NaN or infinity"): cbm.score(y_true, y_predicted) @@ -71,8 +100,9 @@ def test_cbm_input_contains_nan(X_y_binary): def test_cbm_input_contains_inf(capsys): - cbm = CostBenefitMatrix(true_positive=10, true_negative=-1, - false_positive=-7, false_negative=-2) + cbm = CostBenefitMatrix( + true_positive=10, true_negative=-1, false_positive=-7, false_negative=-2 + ) y_predicted = np.array([np.inf, 0, 0]) y_true = np.array([1, 0, 0]) with pytest.raises(ValueError, match="y_predicted contains NaN or infinity"): @@ -85,8 +115,9 @@ def 
test_cbm_input_contains_inf(capsys): def test_cbm_different_input_lengths(): - cbm = CostBenefitMatrix(true_positive=10, true_negative=-1, - false_positive=-7, false_negative=-2) + cbm = CostBenefitMatrix( + true_positive=10, true_negative=-1, false_positive=-7, false_negative=-2 + ) y_predicted = pd.Series([0, 0]) y_true = pd.Series([1]) with pytest.raises(ValueError, match="Inputs have mismatched dimensions"): @@ -99,8 +130,9 @@ def test_cbm_different_input_lengths(): def test_cbm_zero_input_lengths(): - cbm = CostBenefitMatrix(true_positive=10, true_negative=-1, - false_positive=-7, false_negative=-2) + cbm = CostBenefitMatrix( + true_positive=10, true_negative=-1, false_positive=-7, false_negative=-2 + ) y_predicted = pd.Series([]) y_true = pd.Series([]) with pytest.raises(ValueError, match="Length of inputs is 0"): @@ -108,11 +140,14 @@ def test_cbm_zero_input_lengths(): def test_cbm_binary_more_than_two_unique_values(): - cbm = CostBenefitMatrix(true_positive=10, true_negative=-1, - false_positive=-7, false_negative=-2) + cbm = CostBenefitMatrix( + true_positive=10, true_negative=-1, false_positive=-7, false_negative=-2 + ) y_predicted = pd.Series([0, 1, 2]) y_true = pd.Series([1, 0, 1]) - with pytest.raises(ValueError, match="y_predicted contains more than two unique values"): + with pytest.raises( + ValueError, match="y_predicted contains more than two unique values" + ): cbm.score(y_true, y_predicted) y_true = pd.Series([0, 1, 2]) diff --git a/evalml/tests/objective_tests/test_fraud_detection.py b/evalml/tests/objective_tests/test_fraud_detection.py index 84ac13c688..0a83d6bf38 100644 --- a/evalml/tests/objective_tests/test_fraud_detection.py +++ b/evalml/tests/objective_tests/test_fraud_detection.py @@ -9,12 +9,20 @@ def test_fraud_objective(X_y_binary): X, y = X_y_binary - objective = FraudCost(retry_percentage=.5, - interchange_fee=.02, - fraud_payout_percentage=.75, - amount_col=10) - - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective=objective, max_iterations=1) + objective = FraudCost( + retry_percentage=0.5, + interchange_fee=0.02, + fraud_payout_percentage=0.75, + amount_col=10, + ) + + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective=objective, + max_iterations=1, + ) automl.search() pipeline = automl.best_pipeline @@ -27,16 +35,22 @@ def test_fraud_objective(X_y_binary): def test_fraud_objective_function_amount_col(X_y_binary): X, y = X_y_binary - objective = FraudCost(retry_percentage=.5, - interchange_fee=.02, - fraud_payout_percentage=.75, - amount_col="this column does not exist") - y_predicted = pd.Series([.1, .5, .5]) + objective = FraudCost( + retry_percentage=0.5, + interchange_fee=0.02, + fraud_payout_percentage=0.75, + amount_col="this column does not exist", + ) + y_predicted = pd.Series([0.1, 0.5, 0.5]) y_true = [True, False, True] - with pytest.raises(ValueError, match="`this column does not exist` is not a valid column in X."): + with pytest.raises( + ValueError, match="`this column does not exist` is not a valid column in X." + ): objective.objective_function(y_true, y_predicted, X) - with pytest.raises(ValueError, match="`this column does not exist` is not a valid column in X."): + with pytest.raises( + ValueError, match="`this column does not exist` is not a valid column in X." 
+ ): objective.objective_function(y_true, y_predicted, X.tolist()) @@ -91,7 +105,9 @@ def test_binary_more_than_two_unique_values(): fraud_cost = FraudCost(amount_col="value") y_predicted = np.array([0, 1, 2]) y_true = np.array([1, 0, 1]) - with pytest.raises(ValueError, match="y_predicted contains more than two unique values"): + with pytest.raises( + ValueError, match="y_predicted contains more than two unique values" + ): fraud_cost.score(y_true, y_predicted) y_true = np.array([0, 1, 2]) @@ -104,7 +120,7 @@ def test_fraud_objective_score(X_y_binary): X, y = X_y_binary fraud_cost = FraudCost(amount_col="value") - y_predicted = pd.Series([.1, .5, .5]) + y_predicted = pd.Series([0.1, 0.5, 0.5]) y_true = pd.Series([True, False, True]) extra_columns = pd.DataFrame({"value": [100, 5, 250]}) @@ -112,42 +128,42 @@ def test_fraud_objective_score(X_y_binary): assert isinstance(out, pd.Series) pd.testing.assert_series_equal(out, y_true, check_names=False) score = fraud_cost.score(y_true, out, extra_columns) - assert (score == 0.0) + assert score == 0.0 out = fraud_cost.decision_function(y_predicted.to_numpy(), 5, extra_columns) assert isinstance(out, pd.Series) pd.testing.assert_series_equal(out, y_true, check_names=False) score = fraud_cost.score(y_true, out, extra_columns) - assert (score == 0.0) + assert score == 0.0 out = fraud_cost.decision_function(y_predicted, 5, extra_columns) pd.testing.assert_series_equal(out, y_true, check_dtype=False, check_names=False) score = fraud_cost.score(y_true, out, extra_columns) - assert (score == 0.0) + assert score == 0.0 # testing with other types of inputs - y_predicted = np.array([.1, .5, .5]) + y_predicted = np.array([0.1, 0.5, 0.5]) extra_columns = pd.DataFrame({"value": [100, 5, 250]}) out = fraud_cost.decision_function(y_predicted, 5, extra_columns) pd.testing.assert_series_equal(out, y_true, check_names=False) score = fraud_cost.score(y_true, out, extra_columns) - assert (score == 0.0) + assert score == 0.0 - y_predicted = pd.Series([.2, .01, .01]) + y_predicted = pd.Series([0.2, 0.01, 0.01]) extra_columns = pd.DataFrame({"value": [100, 50, 50]}) y_true = pd.Series([False, False, True]) expected_y_pred = pd.Series([True, False, False]) out = fraud_cost.decision_function(y_predicted, 10, extra_columns) pd.testing.assert_series_equal(out, expected_y_pred, check_names=False) score = fraud_cost.score(y_true, out, extra_columns) - assert (score == 0.255) + assert score == 0.255 def test_fraud_objective_score_list(X_y_binary): X, y = X_y_binary fraud_cost = FraudCost(amount_col="value") - y_predicted = [.1, .5, .5] + y_predicted = [0.1, 0.5, 0.5] y_true = [True, False, True] extra_columns = pd.DataFrame({"value": [100, 5, 250]}) @@ -155,4 +171,4 @@ def test_fraud_objective_score_list(X_y_binary): assert isinstance(out, pd.Series) pd.testing.assert_series_equal(out, pd.Series(y_true), check_names=False) score = fraud_cost.score(y_true, out, extra_columns) - assert (score == 0.0) + assert score == 0.0 diff --git a/evalml/tests/objective_tests/test_lead_scoring.py b/evalml/tests/objective_tests/test_lead_scoring.py index 20bf63049b..b2810ad829 100644 --- a/evalml/tests/objective_tests/test_lead_scoring.py +++ b/evalml/tests/objective_tests/test_lead_scoring.py @@ -10,11 +10,16 @@ def test_lead_scoring_works_during_automl_search(X_y_binary): X, y = X_y_binary - objective = LeadScoring(true_positives=1, - false_positives=-1) - - automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', objective=objective, max_iterations=1, - random_seed=0) + 
objective = LeadScoring(true_positives=1, false_positives=-1) + + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + objective=objective, + max_iterations=1, + random_seed=0, + ) automl.search() pipeline = automl.best_pipeline pipeline.fit(X, y) @@ -25,25 +30,23 @@ def test_lead_scoring_works_during_automl_search(X_y_binary): def test_lead_scoring_objective(): - objective = LeadScoring(true_positives=1, - false_positives=-1) + objective = LeadScoring(true_positives=1, false_positives=-1) - predicted = pd.Series([1, 10, .5, 5]) + predicted = pd.Series([1, 10, 0.5, 5]) out = objective.decision_function(predicted, 1) y_true = pd.Series([False, True, False, True]) assert out.tolist() == [False, True, False, True] - predicted = np.array([1, 10, .5, 5]) + predicted = np.array([1, 10, 0.5, 5]) out = objective.decision_function(predicted, 1) assert out.tolist() == y_true.to_list() score = objective.score(out, y_true) - assert (score == 0.5) + assert score == 0.5 def test_input_contains_nan(X_y_binary): - objective = LeadScoring(true_positives=1, - false_positives=-1) + objective = LeadScoring(true_positives=1, false_positives=-1) y_predicted = np.array([np.nan, 0, 0]) y_true = np.array([1, 2, 1]) with pytest.raises(ValueError, match="y_predicted contains NaN or infinity"): @@ -56,8 +59,7 @@ def test_input_contains_nan(X_y_binary): def test_input_contains_inf(capsys): - objective = LeadScoring(true_positives=1, - false_positives=-1) + objective = LeadScoring(true_positives=1, false_positives=-1) y_predicted = np.array([np.inf, 0, 0]) y_true = np.array([1, 0, 0]) with pytest.raises(ValueError, match="y_predicted contains NaN or infinity"): @@ -70,8 +72,7 @@ def test_input_contains_inf(capsys): def test_different_input_lengths(): - objective = LeadScoring(true_positives=1, - false_positives=-1) + objective = LeadScoring(true_positives=1, false_positives=-1) y_predicted = np.array([0, 0]) y_true = np.array([1]) with pytest.raises(ValueError, match="Inputs have mismatched dimensions"): @@ -84,8 +85,7 @@ def test_different_input_lengths(): def test_zero_input_lengths(): - objective = LeadScoring(true_positives=1, - false_positives=-1) + objective = LeadScoring(true_positives=1, false_positives=-1) y_predicted = np.array([]) y_true = np.array([]) with pytest.raises(ValueError, match="Length of inputs is 0"): @@ -93,11 +93,12 @@ def test_zero_input_lengths(): def test_binary_more_than_two_unique_values(): - objective = LeadScoring(true_positives=1, - false_positives=-1) + objective = LeadScoring(true_positives=1, false_positives=-1) y_predicted = np.array([0, 1, 2]) y_true = np.array([1, 0, 1]) - with pytest.raises(ValueError, match="y_predicted contains more than two unique values"): + with pytest.raises( + ValueError, match="y_predicted contains more than two unique values" + ): objective.score(y_true, y_predicted) y_true = np.array([0, 1, 2]) diff --git a/evalml/tests/objective_tests/test_objectives.py b/evalml/tests/objective_tests/test_objectives.py index 70d359a557..00c73f6a78 100644 --- a/evalml/tests/objective_tests/test_objectives.py +++ b/evalml/tests/objective_tests/test_objectives.py @@ -13,7 +13,7 @@ get_core_objective_names, get_core_objectives, get_non_core_objectives, - get_objective + get_objective, ) from evalml.objectives.objective_base import ObjectiveBase from evalml.objectives.utils import _all_objectives_dict @@ -47,7 +47,6 @@ def test_get_objective_works_for_names_of_defined_objectives(obj): def 
test_get_objective_does_raises_error_for_incorrect_name_or_random_class(): - class InvalidObjective: pass @@ -62,7 +61,10 @@ class InvalidObjective: def test_get_objective_return_instance_does_not_work_for_some_objectives(): - with pytest.raises(ObjectiveCreationError, match="In get_objective, cannot pass in return_instance=True for Cost Benefit Matrix"): + with pytest.raises( + ObjectiveCreationError, + match="In get_objective, cannot pass in return_instance=True for Cost Benefit Matrix", + ): get_objective("Cost Benefit Matrix", return_instance=True) cbm = CostBenefitMatrix(0, 0, 0, 0) @@ -76,15 +78,23 @@ def test_get_objective_does_not_work_for_none_type(): def test_get_objective_kwargs(): - obj = get_objective("cost benefit matrix", return_instance=True, - true_positive=0, true_negative=0, false_positive=0, false_negative=0) + obj = get_objective( + "cost benefit matrix", + return_instance=True, + true_positive=0, + true_negative=0, + false_positive=0, + false_negative=0, + ) assert isinstance(obj, CostBenefitMatrix) def test_can_get_only_core_and_all_objective_names(): all_objective_names = get_all_objective_names() core_objective_names = get_core_objective_names() - assert set(all_objective_names).difference(core_objective_names) == {c.name.lower() for c in get_non_core_objectives()} + assert set(all_objective_names).difference(core_objective_names) == { + c.name.lower() for c in get_non_core_objectives() + } def test_get_core_objectives_types(): @@ -98,9 +108,13 @@ def test_get_time_series_objectives_types(time_series_objectives): assert len(time_series_objectives) == 10 -def test_objective_outputs(X_y_binary, X_y_multi, binary_core_objectives, - multiclass_core_objectives, - regression_core_objectives): +def test_objective_outputs( + X_y_binary, + X_y_multi, + binary_core_objectives, + multiclass_core_objectives, + regression_core_objectives, +): _, y_binary_np = X_y_binary assert isinstance(y_binary_np, np.ndarray) _, y_multi_np = X_y_multi @@ -109,27 +123,41 @@ def test_objective_outputs(X_y_binary, X_y_multi, binary_core_objectives, y_pred_multi_np = y_multi_np # convert to a simulated predicted probability, which must range between 0 and 1 classes = np.unique(y_multi_np) - y_pred_proba_multi_np = np.concatenate([(y_multi_np == val).astype(float).reshape(-1, 1) for val in classes], axis=1) + y_pred_proba_multi_np = np.concatenate( + [(y_multi_np == val).astype(float).reshape(-1, 1) for val in classes], axis=1 + ) - all_objectives = binary_core_objectives + regression_core_objectives + multiclass_core_objectives + all_objectives = ( + binary_core_objectives + regression_core_objectives + multiclass_core_objectives + ) for objective in all_objectives: - print('Testing objective {}'.format(objective.name)) + print("Testing objective {}".format(objective.name)) expected_value = 1.0 if objective.greater_is_better else 0.0 if isinstance(objective, (RegressionObjective, BinaryClassificationObjective)): - np.testing.assert_almost_equal(objective.score(y_binary_np, y_binary_np), expected_value) - np.testing.assert_almost_equal(objective.score(pd.Series(y_binary_np), pd.Series(y_binary_np)), expected_value) + np.testing.assert_almost_equal( + objective.score(y_binary_np, y_binary_np), expected_value + ) + np.testing.assert_almost_equal( + objective.score(pd.Series(y_binary_np), pd.Series(y_binary_np)), + expected_value, + ) if isinstance(objective, MulticlassClassificationObjective): y_predicted = y_pred_multi_np y_predicted_pd = pd.Series(y_predicted) if objective.score_needs_proba: 
y_predicted = y_pred_proba_multi_np y_predicted_pd = pd.DataFrame(y_predicted) - np.testing.assert_almost_equal(objective.score(y_true_multi_np, y_predicted), expected_value) - np.testing.assert_almost_equal(objective.score(pd.Series(y_true_multi_np), y_predicted_pd), expected_value) + np.testing.assert_almost_equal( + objective.score(y_true_multi_np, y_predicted), expected_value + ) + np.testing.assert_almost_equal( + objective.score(pd.Series(y_true_multi_np), y_predicted_pd), + expected_value, + ) def test_is_defined_for_problem_type(): assert LogLossBinary.is_defined_for_problem_type(ProblemTypes.BINARY) - assert LogLossBinary.is_defined_for_problem_type('binary') + assert LogLossBinary.is_defined_for_problem_type("binary") assert not LogLossBinary.is_defined_for_problem_type(ProblemTypes.MULTICLASS) diff --git a/evalml/tests/objective_tests/test_sla.py b/evalml/tests/objective_tests/test_sla.py index f417f7bf1b..a2d44c1bdb 100644 --- a/evalml/tests/objective_tests/test_sla.py +++ b/evalml/tests/objective_tests/test_sla.py @@ -4,7 +4,7 @@ from evalml.objectives import SensitivityLowAlert from evalml.tests.objective_tests.test_binary_classification_objective import ( - TestBinaryObjective + TestBinaryObjective, ) @@ -29,21 +29,40 @@ def test_invalid_alert_rate(self, alert_rate): with pytest.raises(ValueError): SensitivityLowAlert(alert_rate) - @pytest.mark.parametrize("alert_rate, ypred_proba, high_risk", [ - (0.1, pd.Series([0.5, 0.5, 0.5]), [True, True, True]), - (0.1, list(range(10)), [False if i != 9 else True for i in range(10)])]) + @pytest.mark.parametrize( + "alert_rate, ypred_proba, high_risk", + [ + (0.1, pd.Series([0.5, 0.5, 0.5]), [True, True, True]), + (0.1, list(range(10)), [False if i != 9 else True for i in range(10)]), + ], + ) def test_high_risk_output(self, alert_rate, ypred_proba, high_risk): self.assign_objective(alert_rate) assert self.objective.decision_function(ypred_proba).tolist() == high_risk - @pytest.mark.parametrize("y_true, y_predicted, expected_score", [ - (pd.Series([False, False, False]), pd.Series([True, True, False]), np.nan), - (pd.Series([True, True, True, True]), pd.Series([True, True, False, False]), 0.5)]) + @pytest.mark.parametrize( + "y_true, y_predicted, expected_score", + [ + (pd.Series([False, False, False]), pd.Series([True, True, False]), np.nan), + ( + pd.Series([True, True, True, True]), + pd.Series([True, True, False, False]), + 0.5, + ), + ], + ) def test_score(self, y_true, y_predicted, expected_score): sensitivity = SensitivityLowAlert(0.1).objective_function(y_true, y_predicted) assert (sensitivity is expected_score) or (sensitivity == expected_score) - def test_all_base_tests(self, fix_y_pred_na, fix_y_true, fix_y_pred_diff_len, fix_empty_array, fix_y_pred_multi): + def test_all_base_tests( + self, + fix_y_pred_na, + fix_y_true, + fix_y_pred_diff_len, + fix_empty_array, + fix_y_pred_multi, + ): self.assign_objective(0.1) self.input_contains_nan_inf(fix_y_pred_na, fix_y_true) self.different_input_lengths(fix_y_pred_diff_len, fix_y_true) diff --git a/evalml/tests/objective_tests/test_standard_metrics.py b/evalml/tests/objective_tests/test_standard_metrics.py index 50c1f045b8..c16a6bdf4d 100644 --- a/evalml/tests/objective_tests/test_standard_metrics.py +++ b/evalml/tests/objective_tests/test_standard_metrics.py @@ -32,16 +32,17 @@ RecallMicro, RecallWeighted, RootMeanSquaredError, - RootMeanSquaredLogError -) -from evalml.objectives.utils import ( - _all_objectives_dict, - get_non_core_objectives + RootMeanSquaredLogError, ) +from 
evalml.objectives.utils import _all_objectives_dict, get_non_core_objectives EPS = 1e-5 all_automl_objectives = _all_objectives_dict() -all_automl_objectives = {name: class_() for name, class_ in all_automl_objectives.items() if class_ not in get_non_core_objectives()} +all_automl_objectives = { + name: class_() + for name, class_ in all_automl_objectives.items() + if class_ not in get_non_core_objectives() +} def test_input_contains_nan(): @@ -61,7 +62,9 @@ def test_input_contains_nan(): y_predicted_proba = np.array([[1, np.nan], [0.1, 0]]) for objective in all_automl_objectives.values(): if objective.score_needs_proba: - with pytest.raises(ValueError, match="y_predicted contains NaN or infinity"): + with pytest.raises( + ValueError, match="y_predicted contains NaN or infinity" + ): objective.score(y_true, y_predicted_proba) @@ -82,7 +85,9 @@ def test_input_contains_inf(): y_predicted_proba = np.array([[1, np.inf], [0.1, 0]]) for objective in all_automl_objectives.values(): if objective.score_needs_proba: - with pytest.raises(ValueError, match="y_predicted contains NaN or infinity"): + with pytest.raises( + ValueError, match="y_predicted contains NaN or infinity" + ): objective.score(y_true, y_predicted_proba) @@ -113,21 +118,27 @@ def test_probabilities_not_in_0_1_range(): y_true = np.array([1, 0, 1]) for objective in all_automl_objectives.values(): if objective.score_needs_proba: - with pytest.raises(ValueError, match="y_predicted contains probability estimates"): + with pytest.raises( + ValueError, match="y_predicted contains probability estimates" + ): objective.score(y_true, y_predicted) y_predicted = np.array([0.3, -0.001, 0.3]) y_true = np.array([1, 0, 1]) for objective in all_automl_objectives.values(): if objective.score_needs_proba: - with pytest.raises(ValueError, match="y_predicted contains probability estimates"): + with pytest.raises( + ValueError, match="y_predicted contains probability estimates" + ): objective.score(y_true, y_predicted) y_true = np.array([1, 0]) y_predicted_proba = np.array([[1, 3], [0.1, 0]]) for objective in all_automl_objectives.values(): if objective.score_needs_proba: - with pytest.raises(ValueError, match="y_predicted contains probability estimates"): + with pytest.raises( + ValueError, match="y_predicted contains probability estimates" + ): objective.score(y_true, y_predicted_proba) @@ -135,7 +146,10 @@ def test_negative_with_log(): y_predicted = np.array([-1, 10, 30]) y_true = np.array([-1, 0, 1]) for objective in [MeanSquaredLogError(), RootMeanSquaredLogError()]: - with pytest.raises(ValueError, match="Mean Squared Logarithmic Error cannot be used when targets contain negative values."): + with pytest.raises( + ValueError, + match="Mean Squared Logarithmic Error cannot be used when targets contain negative values.", + ): objective.score(y_true, y_predicted) @@ -143,243 +157,292 @@ def test_binary_more_than_two_unique_values(): y_predicted = np.array([0, 1, 2]) y_true = np.array([1, 0, 1]) for objective in all_automl_objectives.values(): - if isinstance(objective, BinaryClassificationObjective) and not objective.score_needs_proba: - with pytest.raises(ValueError, match="y_predicted contains more than two unique values"): + if ( + isinstance(objective, BinaryClassificationObjective) + and not objective.score_needs_proba + ): + with pytest.raises( + ValueError, match="y_predicted contains more than two unique values" + ): objective.score(y_true, y_predicted) y_true = np.array([0, 1, 2]) y_predicted = np.array([1, 0, 1]) for objective in 
all_automl_objectives.values(): - if isinstance(objective, BinaryClassificationObjective) and not objective.score_needs_proba: - with pytest.raises(ValueError, match="y_true contains more than two unique values"): + if ( + isinstance(objective, BinaryClassificationObjective) + and not objective.score_needs_proba + ): + with pytest.raises( + ValueError, match="y_true contains more than two unique values" + ): objective.score(y_true, y_predicted) def test_accuracy_binary(): obj = AccuracyBinary() - assert obj.score(np.array([0, 0, 1, 1]), - np.array([1, 1, 0, 0])) == pytest.approx(0.0, EPS) - assert obj.score(np.array([0, 0, 1, 1]), - np.array([0, 1, 0, 1])) == pytest.approx(0.5, EPS) - assert obj.score(np.array([0, 0, 1, 1]), - np.array([0, 0, 1, 1])) == pytest.approx(1.0, EPS) + assert obj.score(np.array([0, 0, 1, 1]), np.array([1, 1, 0, 0])) == pytest.approx( + 0.0, EPS + ) + assert obj.score(np.array([0, 0, 1, 1]), np.array([0, 1, 0, 1])) == pytest.approx( + 0.5, EPS + ) + assert obj.score(np.array([0, 0, 1, 1]), np.array([0, 0, 1, 1])) == pytest.approx( + 1.0, EPS + ) def test_accuracy_multi(): obj = AccuracyMulticlass() - assert obj.score(np.array([0, 0, 1, 1]), - np.array([1, 1, 0, 0])) == pytest.approx(0.0, EPS) - assert obj.score(np.array([0, 0, 1, 1]), - np.array([0, 1, 0, 1])) == pytest.approx(0.5, EPS) - assert obj.score(np.array([0, 0, 1, 1]), - np.array([0, 0, 1, 1])) == pytest.approx(1.0, EPS) - assert obj.score(np.array([0, 0, 1, 1, 2, 2]), - np.array([0, 0, 0, 0, 0, 0])) == pytest.approx(1 / 3.0, EPS) - assert obj.score(np.array([0, 0, 0, 0, 0, 0]), - np.array([0, 0, 1, 1, 2, 2])) == pytest.approx(1 / 3.0, EPS) + assert obj.score(np.array([0, 0, 1, 1]), np.array([1, 1, 0, 0])) == pytest.approx( + 0.0, EPS + ) + assert obj.score(np.array([0, 0, 1, 1]), np.array([0, 1, 0, 1])) == pytest.approx( + 0.5, EPS + ) + assert obj.score(np.array([0, 0, 1, 1]), np.array([0, 0, 1, 1])) == pytest.approx( + 1.0, EPS + ) + assert obj.score( + np.array([0, 0, 1, 1, 2, 2]), np.array([0, 0, 0, 0, 0, 0]) + ) == pytest.approx(1 / 3.0, EPS) + assert obj.score( + np.array([0, 0, 0, 0, 0, 0]), np.array([0, 0, 1, 1, 2, 2]) + ) == pytest.approx(1 / 3.0, EPS) def test_balanced_accuracy_binary(): obj = BalancedAccuracyBinary() - assert obj.score(np.array([0, 1, 0, 0, 1, 0]), - np.array([0, 1, 0, 0, 0, 1])) == pytest.approx(0.625, EPS) + assert obj.score( + np.array([0, 1, 0, 0, 1, 0]), np.array([0, 1, 0, 0, 0, 1]) + ) == pytest.approx(0.625, EPS) - assert obj.score(np.array([0, 1, 0, 0, 1, 0]), - np.array([0, 1, 0, 0, 1, 0])) == pytest.approx(1.0, EPS) + assert obj.score( + np.array([0, 1, 0, 0, 1, 0]), np.array([0, 1, 0, 0, 1, 0]) + ) == pytest.approx(1.0, EPS) - assert obj.score(np.array([0, 1, 0, 0, 1, 0]), - np.array([1, 0, 1, 1, 0, 1])) == pytest.approx(0.0, EPS) + assert obj.score( + np.array([0, 1, 0, 0, 1, 0]), np.array([1, 0, 1, 1, 0, 1]) + ) == pytest.approx(0.0, EPS) def test_balanced_accuracy_multi(): obj = BalancedAccuracyMulticlass() - assert obj.score(np.array([0, 1, 2, 0, 1, 2, 3]), - np.array([0, 0, 2, 0, 0, 2, 3])) == pytest.approx(0.75, EPS) + assert obj.score( + np.array([0, 1, 2, 0, 1, 2, 3]), np.array([0, 0, 2, 0, 0, 2, 3]) + ) == pytest.approx(0.75, EPS) - assert obj.score(np.array([0, 1, 2, 0, 1, 2, 3]), - np.array([0, 1, 2, 0, 1, 2, 3])) == pytest.approx(1.0, EPS) + assert obj.score( + np.array([0, 1, 2, 0, 1, 2, 3]), np.array([0, 1, 2, 0, 1, 2, 3]) + ) == pytest.approx(1.0, EPS) - assert obj.score(np.array([0, 1, 2, 0, 1, 2, 3]), - np.array([1, 0, 3, 1, 2, 1, 0])) == 
pytest.approx(0.0, EPS) + assert obj.score( + np.array([0, 1, 2, 0, 1, 2, 3]), np.array([1, 0, 3, 1, 2, 1, 0]) + ) == pytest.approx(0.0, EPS) def test_f1_binary(): obj = F1() - assert obj.score(np.array([0, 1, 0, 0, 1, 0]), - np.array([0, 1, 0, 0, 0, 1])) == pytest.approx(0.5, EPS) + assert obj.score( + np.array([0, 1, 0, 0, 1, 0]), np.array([0, 1, 0, 0, 0, 1]) + ) == pytest.approx(0.5, EPS) - assert obj.score(np.array([0, 1, 0, 0, 1, 1]), - np.array([0, 1, 0, 0, 1, 1])) == pytest.approx(1.0, EPS) + assert obj.score( + np.array([0, 1, 0, 0, 1, 1]), np.array([0, 1, 0, 0, 1, 1]) + ) == pytest.approx(1.0, EPS) - assert obj.score(np.array([0, 0, 0, 0, 1, 0]), - np.array([0, 1, 0, 0, 0, 1])) == pytest.approx(0.0, EPS) + assert obj.score( + np.array([0, 0, 0, 0, 1, 0]), np.array([0, 1, 0, 0, 0, 1]) + ) == pytest.approx(0.0, EPS) - assert obj.score(np.array([0, 0]), - np.array([0, 0])) == pytest.approx(0.0, EPS) + assert obj.score(np.array([0, 0]), np.array([0, 0])) == pytest.approx(0.0, EPS) def test_f1_micro_multi(): obj = F1Micro() - assert obj.score(np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])) == pytest.approx(1 / 3.0, EPS) + assert obj.score( + np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + ) == pytest.approx(1 / 3.0, EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])) == pytest.approx(1.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + ) == pytest.approx(1.0, EPS) - assert obj.score(np.array([2, 2, 2, 0, 0, 0, 1, 1, 1]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])) == pytest.approx(0.0, EPS) + assert obj.score( + np.array([2, 2, 2, 0, 0, 0, 1, 1, 1]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + ) == pytest.approx(0.0, EPS) - assert obj.score(np.array([1, 2]), - np.array([0, 0])) == pytest.approx(0.0, EPS) + assert obj.score(np.array([1, 2]), np.array([0, 0])) == pytest.approx(0.0, EPS) def test_f1_macro_multi(): obj = F1Macro() - assert obj.score(np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])) \ - == pytest.approx(2 * (1 / 3.0) * (1 / 9.0) / (1 / 3.0 + 1 / 9.0), EPS) + assert obj.score( + np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + ) == pytest.approx(2 * (1 / 3.0) * (1 / 9.0) / (1 / 3.0 + 1 / 9.0), EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])) == pytest.approx(1.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + ) == pytest.approx(1.0, EPS) - assert obj.score(np.array([2, 2, 2, 0, 0, 0, 1, 1, 1]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])) == pytest.approx(0.0, EPS) + assert obj.score( + np.array([2, 2, 2, 0, 0, 0, 1, 1, 1]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + ) == pytest.approx(0.0, EPS) - assert obj.score(np.array([1, 2]), - np.array([0, 0])) == pytest.approx(0.0, EPS) + assert obj.score(np.array([1, 2]), np.array([0, 0])) == pytest.approx(0.0, EPS) def test_f1_weighted_multi(): obj = F1Weighted() - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 0, 0, 0, 0, 0, 0])) \ - == pytest.approx(2 * (1 / 3.0) * (1 / 9.0) / (1 / 3.0 + 1 / 9.0), EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]) + ) == pytest.approx(2 * (1 / 3.0) * (1 / 9.0) / (1 / 3.0 + 1 / 9.0), EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])) == 
pytest.approx(1.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + ) == pytest.approx(1.0, EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([2, 2, 2, 0, 0, 0, 1, 1, 1])) == pytest.approx(0.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([2, 2, 2, 0, 0, 0, 1, 1, 1]) + ) == pytest.approx(0.0, EPS) - assert obj.score(np.array([0, 0]), - np.array([1, 2])) == pytest.approx(0.0, EPS) + assert obj.score(np.array([0, 0]), np.array([1, 2])) == pytest.approx(0.0, EPS) def test_precision_binary(): obj = Precision() - assert obj.score(np.array([1, 1, 1, 1, 1, 1]), - np.array([0, 0, 0, 1, 1, 1])) == pytest.approx(1.0, EPS) + assert obj.score( + np.array([1, 1, 1, 1, 1, 1]), np.array([0, 0, 0, 1, 1, 1]) + ) == pytest.approx(1.0, EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1]), - np.array([1, 1, 1, 1, 1, 1])) == pytest.approx(0.5, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1]), np.array([1, 1, 1, 1, 1, 1]) + ) == pytest.approx(0.5, EPS) - assert obj.score(np.array([0, 0, 0, 0, 0, 0]), - np.array([1, 1, 1, 1, 1, 1])) == pytest.approx(0.0, EPS) + assert obj.score( + np.array([0, 0, 0, 0, 0, 0]), np.array([1, 1, 1, 1, 1, 1]) + ) == pytest.approx(0.0, EPS) - assert obj.score(np.array([0, 0, 0, 0, 0, 0]), - np.array([0, 0, 0, 0, 0, 0])) == pytest.approx(0.0, EPS) + assert obj.score( + np.array([0, 0, 0, 0, 0, 0]), np.array([0, 0, 0, 0, 0, 0]) + ) == pytest.approx(0.0, EPS) def test_precision_micro_multi(): obj = PrecisionMicro() - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 0, 0, 0, 0, 0, 0])) == pytest.approx(1 / 3.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]) + ) == pytest.approx(1 / 3.0, EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])) == pytest.approx(1.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + ) == pytest.approx(1.0, EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([2, 2, 2, 0, 0, 0, 1, 1, 1])) == pytest.approx(0.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([2, 2, 2, 0, 0, 0, 1, 1, 1]) + ) == pytest.approx(0.0, EPS) - assert obj.score(np.array([0, 0]), - np.array([1, 2])) == pytest.approx(0.0, EPS) + assert obj.score(np.array([0, 0]), np.array([1, 2])) == pytest.approx(0.0, EPS) def test_precision_macro_multi(): obj = PrecisionMacro() - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 0, 0, 0, 0, 0, 0])) == pytest.approx(1 / 9.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]) + ) == pytest.approx(1 / 9.0, EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])) == pytest.approx(1.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + ) == pytest.approx(1.0, EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([2, 2, 2, 0, 0, 0, 1, 1, 1])) == pytest.approx(0.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([2, 2, 2, 0, 0, 0, 1, 1, 1]) + ) == pytest.approx(0.0, EPS) - assert obj.score(np.array([0, 0]), - np.array([1, 2])) == pytest.approx(0.0, EPS) + assert obj.score(np.array([0, 0]), np.array([1, 2])) == pytest.approx(0.0, EPS) def test_precision_weighted_multi(): obj = 
PrecisionWeighted() - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 0, 0, 0, 0, 0, 0])) == pytest.approx(1 / 9.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]) + ) == pytest.approx(1 / 9.0, EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])) == pytest.approx(1.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + ) == pytest.approx(1.0, EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([2, 2, 2, 0, 0, 0, 1, 1, 1])) == pytest.approx(0.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([2, 2, 2, 0, 0, 0, 1, 1, 1]) + ) == pytest.approx(0.0, EPS) - assert obj.score(np.array([0, 0]), - np.array([1, 2])) == pytest.approx(0.0, EPS) + assert obj.score(np.array([0, 0]), np.array([1, 2])) == pytest.approx(0.0, EPS) def test_recall_binary(): obj = Recall() - assert obj.score(np.array([0, 0, 0, 1, 1, 1]), - np.array([1, 1, 1, 1, 1, 1])) == pytest.approx(1.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1]), np.array([1, 1, 1, 1, 1, 1]) + ) == pytest.approx(1.0, EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1]), - np.array([0, 0, 0, 0, 0, 0])) == pytest.approx(0.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1]), np.array([0, 0, 0, 0, 0, 0]) + ) == pytest.approx(0.0, EPS) - assert obj.score(np.array([1, 1, 1, 1, 1, 1]), - np.array([0, 0, 0, 1, 1, 1])) == pytest.approx(0.5, EPS) + assert obj.score( + np.array([1, 1, 1, 1, 1, 1]), np.array([0, 0, 0, 1, 1, 1]) + ) == pytest.approx(0.5, EPS) def test_recall_micro_multi(): obj = RecallMicro() - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 0, 0, 0, 0, 0, 0])) == pytest.approx(1 / 3.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]) + ) == pytest.approx(1 / 3.0, EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])) == pytest.approx(1.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + ) == pytest.approx(1.0, EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([2, 2, 2, 0, 0, 0, 1, 1, 1])) == pytest.approx(0.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([2, 2, 2, 0, 0, 0, 1, 1, 1]) + ) == pytest.approx(0.0, EPS) - assert obj.score(np.array([0, 0]), - np.array([1, 2])) == pytest.approx(0.0, EPS) + assert obj.score(np.array([0, 0]), np.array([1, 2])) == pytest.approx(0.0, EPS) def test_recall_macro_multi(): obj = RecallMacro() - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 0, 0, 0, 0, 0, 0])) == pytest.approx(1 / 3.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]) + ) == pytest.approx(1 / 3.0, EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])) == pytest.approx(1.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + ) == pytest.approx(1.0, EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([2, 2, 2, 0, 0, 0, 1, 1, 1])) == pytest.approx(0.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([2, 2, 2, 0, 0, 0, 1, 1, 1]) + ) == pytest.approx(0.0, EPS) - assert obj.score(np.array([0, 0]), - np.array([1, 2])) == 
pytest.approx(0.0, EPS) + assert obj.score(np.array([0, 0]), np.array([1, 2])) == pytest.approx(0.0, EPS) def test_recall_weighted_multi(): obj = RecallWeighted() - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 0, 0, 0, 0, 0, 0])) == pytest.approx(1 / 3.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]) + ) == pytest.approx(1 / 3.0, EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])) == pytest.approx(1.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + ) == pytest.approx(1.0, EPS) - assert obj.score(np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([2, 2, 2, 0, 0, 0, 1, 1, 1])) == pytest.approx(0.0, EPS) + assert obj.score( + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([2, 2, 2, 0, 0, 0, 1, 1, 1]) + ) == pytest.approx(0.0, EPS) - assert obj.score(np.array([0, 0]), - np.array([1, 2])) == pytest.approx(0.0, EPS) + assert obj.score(np.array([0, 0]), np.array([1, 2])) == pytest.approx(0.0, EPS) def test_log_linear_model(): @@ -399,9 +462,13 @@ def test_log_linear_model(): assert obj.score(s2_predicted, s2_actual) == pytest.approx(0) assert obj.score(s3_predicted, s3_actual) == pytest.approx(0.617267976207983) - assert root_obj.score(s1_predicted, s1_actual) == pytest.approx(np.sqrt(0.562467324910)) + assert root_obj.score(s1_predicted, s1_actual) == pytest.approx( + np.sqrt(0.562467324910) + ) assert root_obj.score(s2_predicted, s2_actual) == pytest.approx(0) - assert root_obj.score(s3_predicted, s3_actual) == pytest.approx(np.sqrt(0.617267976207983)) + assert root_obj.score(s3_predicted, s3_actual) == pytest.approx( + np.sqrt(0.617267976207983) + ) def test_mse_linear_model(): @@ -417,13 +484,13 @@ def test_mse_linear_model(): s3_predicted = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) s3_actual = np.array([2, 2, 2, 0, 0, 0, 1, 1, 1]) - assert obj.score(s1_predicted, s1_actual) == pytest.approx(5. / 3.) + assert obj.score(s1_predicted, s1_actual) == pytest.approx(5.0 / 3.0) assert obj.score(s2_predicted, s2_actual) == pytest.approx(0) - assert obj.score(s3_predicted, s3_actual) == pytest.approx(2.) + assert obj.score(s3_predicted, s3_actual) == pytest.approx(2.0) - assert root_obj.score(s1_predicted, s1_actual) == pytest.approx(np.sqrt(5. 
/ 3.)) + assert root_obj.score(s1_predicted, s1_actual) == pytest.approx(np.sqrt(5.0 / 3.0)) assert root_obj.score(s2_predicted, s2_actual) == pytest.approx(0) - assert root_obj.score(s3_predicted, s3_actual) == pytest.approx(np.sqrt(2.)) + assert root_obj.score(s3_predicted, s3_actual) == pytest.approx(np.sqrt(2.0)) def test_mcc_catches_warnings(): @@ -450,13 +517,23 @@ def test_mape_time_series_model(): s3_actual = np.array([1, 2, 4, 2, 1, 2]) s3_predicted = np.array([0, 2, 2, 1, 3, 2]) - with pytest.raises(ValueError, match="Mean Absolute Percentage Error cannot be used when targets contain the value 0."): + with pytest.raises( + ValueError, + match="Mean Absolute Percentage Error cannot be used when targets contain the value 0.", + ): obj.score(s1_actual, s1_predicted) assert obj.score(s2_actual, s2_predicted) == pytest.approx(8 / 4 * 100) assert obj.score(s3_actual, s3_predicted) == pytest.approx(4 / 6 * 100) - assert obj.score(pd.Series(s3_actual, index=range(-12, -6)), s3_predicted) == pytest.approx(4 / 6 * 100) - assert obj.score(pd.Series(s2_actual, index=range(10, 14)), - pd.Series(s2_predicted, index=range(20, 24))) == pytest.approx(8 / 4 * 100) + assert obj.score( + pd.Series(s3_actual, index=range(-12, -6)), s3_predicted + ) == pytest.approx(4 / 6 * 100) + assert ( + obj.score( + pd.Series(s2_actual, index=range(10, 14)), + pd.Series(s2_predicted, index=range(20, 24)), + ) + == pytest.approx(8 / 4 * 100) + ) @pytest.mark.parametrize("objective_class", _all_objectives_dict().values()) @@ -465,14 +542,21 @@ def test_calculate_percent_difference(objective_class): reference_score = 10 denominator = 1 if objective_class.is_bounded_like_percentage else reference_score - change = ((-1) ** (not objective_class.greater_is_better) * (score - reference_score)) / denominator + change = ( + (-1) ** (not objective_class.greater_is_better) * (score - reference_score) + ) / denominator answer = 100 * change - assert objective_class.calculate_percent_difference(score, reference_score) == answer + assert ( + objective_class.calculate_percent_difference(score, reference_score) == answer + ) assert objective_class.perfect_score is not None -@pytest.mark.parametrize("objective_class,nan_value", product(_all_objectives_dict().values(), [None, np.nan])) +@pytest.mark.parametrize( + "objective_class,nan_value", + product(_all_objectives_dict().values(), [None, np.nan]), +) def test_calculate_percent_difference_with_nan(objective_class, nan_value): assert pd.isna(objective_class.calculate_percent_difference(nan_value, 2)) @@ -482,47 +566,118 @@ def test_calculate_percent_difference_with_nan(objective_class, nan_value): @pytest.mark.parametrize("baseline_score", [0, 1e-11]) @pytest.mark.parametrize("objective_class", _all_objectives_dict().values()) -def test_calculate_percent_difference_when_baseline_0_or_close_to_0(objective_class, baseline_score): +def test_calculate_percent_difference_when_baseline_0_or_close_to_0( + objective_class, baseline_score +): percent_difference = objective_class.calculate_percent_difference(2, baseline_score) if objective_class.is_bounded_like_percentage: - assert percent_difference == ((-1) ** (not objective_class.greater_is_better)) * (2 - baseline_score) * 100 + assert ( + percent_difference + == ((-1) ** (not objective_class.greater_is_better)) + * (2 - baseline_score) + * 100 + ) else: assert np.isinf(percent_difference) def test_calculate_percent_difference_negative_and_equal_numbers(): - assert CostBenefitMatrix.calculate_percent_difference(score=5, 
baseline_score=5) == 0 - assert CostBenefitMatrix.calculate_percent_difference(score=5.003, baseline_score=5.003 - 1e-11) == 0 - assert CostBenefitMatrix.calculate_percent_difference(score=-5, baseline_score=-10) == 50 - assert CostBenefitMatrix.calculate_percent_difference(score=-10, baseline_score=-5) == -100 - assert CostBenefitMatrix.calculate_percent_difference(score=-5, baseline_score=10) == -150 - assert CostBenefitMatrix.calculate_percent_difference(score=10, baseline_score=-5) == 300 + assert ( + CostBenefitMatrix.calculate_percent_difference(score=5, baseline_score=5) == 0 + ) + assert ( + CostBenefitMatrix.calculate_percent_difference( + score=5.003, baseline_score=5.003 - 1e-11 + ) + == 0 + ) + assert ( + CostBenefitMatrix.calculate_percent_difference(score=-5, baseline_score=-10) + == 50 + ) + assert ( + CostBenefitMatrix.calculate_percent_difference(score=-10, baseline_score=-5) + == -100 + ) + assert ( + CostBenefitMatrix.calculate_percent_difference(score=-5, baseline_score=10) + == -150 + ) + assert ( + CostBenefitMatrix.calculate_percent_difference(score=10, baseline_score=-5) + == 300 + ) # These values are not possible for LogLossBinary but we need them for 100% coverage # We might add an objective where lower is better that can take negative values in the future - assert LogLossBinary.calculate_percent_difference(score=-5, baseline_score=-10) == -50 - assert LogLossBinary.calculate_percent_difference(score=5.003, baseline_score=5.003 + 1e-11) == 0 - assert LogLossBinary.calculate_percent_difference(score=-10, baseline_score=-5) == 100 - assert LogLossBinary.calculate_percent_difference(score=-5, baseline_score=10) == 150 - assert LogLossBinary.calculate_percent_difference(score=10, baseline_score=-5) == -300 + assert ( + LogLossBinary.calculate_percent_difference(score=-5, baseline_score=-10) == -50 + ) + assert ( + LogLossBinary.calculate_percent_difference( + score=5.003, baseline_score=5.003 + 1e-11 + ) + == 0 + ) + assert ( + LogLossBinary.calculate_percent_difference(score=-10, baseline_score=-5) == 100 + ) + assert ( + LogLossBinary.calculate_percent_difference(score=-5, baseline_score=10) == 150 + ) + assert ( + LogLossBinary.calculate_percent_difference(score=10, baseline_score=-5) == -300 + ) # Verify percent_difference is 0 when numbers are close to equal for objective that is bounded in [0, 1] assert AccuracyBinary.calculate_percent_difference(score=5, baseline_score=5) == 0 - assert AccuracyBinary.calculate_percent_difference(score=5.003, baseline_score=5.003 + 1e-11) == 0 + assert ( + AccuracyBinary.calculate_percent_difference( + score=5.003, baseline_score=5.003 + 1e-11 + ) + == 0 + ) def test_calculate_percent_difference_small(): expected_value = 100 * -1 * np.abs(1e-9 / (1e-9)) - assert np.isclose(ExpVariance.calculate_percent_difference(score=0, baseline_score=1e-9), expected_value, atol=1e-8) + assert np.isclose( + ExpVariance.calculate_percent_difference(score=0, baseline_score=1e-9), + expected_value, + atol=1e-8, + ) assert ExpVariance.calculate_percent_difference(score=0, baseline_score=1e-10) == 0 - assert ExpVariance.calculate_percent_difference(score=2e-10, baseline_score=1e-10) == 0 - assert np.isinf(ExpVariance.calculate_percent_difference(score=1e-9, baseline_score=0)) - assert np.isinf(ExpVariance.calculate_percent_difference(score=0.1, baseline_score=1e-11)) + assert ( + ExpVariance.calculate_percent_difference(score=2e-10, baseline_score=1e-10) == 0 + ) + assert np.isinf( + ExpVariance.calculate_percent_difference(score=1e-9, 
baseline_score=0) + ) + assert np.isinf( + ExpVariance.calculate_percent_difference(score=0.1, baseline_score=1e-11) + ) expected_value = 100 * np.abs(1e-9) - assert np.isclose(AccuracyBinary.calculate_percent_difference(score=0, baseline_score=1e-9), expected_value, atol=1e-6) - assert AccuracyBinary.calculate_percent_difference(score=0, baseline_score=1e-10) == 0 - assert AccuracyBinary.calculate_percent_difference(score=2e-10, baseline_score=1e-10) == 0 - assert np.isclose(AccuracyBinary.calculate_percent_difference(score=1e-9, baseline_score=0), expected_value, atol=1e-6) - assert np.isclose(AccuracyBinary.calculate_percent_difference(score=0.1, baseline_score=1e-11), 100 * np.abs(0.1 - 1e-11), atol=1e-6) + assert np.isclose( + AccuracyBinary.calculate_percent_difference(score=0, baseline_score=1e-9), + expected_value, + atol=1e-6, + ) + assert ( + AccuracyBinary.calculate_percent_difference(score=0, baseline_score=1e-10) == 0 + ) + assert ( + AccuracyBinary.calculate_percent_difference(score=2e-10, baseline_score=1e-10) + == 0 + ) + assert np.isclose( + AccuracyBinary.calculate_percent_difference(score=1e-9, baseline_score=0), + expected_value, + atol=1e-6, + ) + assert np.isclose( + AccuracyBinary.calculate_percent_difference(score=0.1, baseline_score=1e-11), + 100 * np.abs(0.1 - 1e-11), + atol=1e-6, + ) diff --git a/evalml/tests/pipeline_tests/classification_pipeline_tests/test_binary_classification.py b/evalml/tests/pipeline_tests/classification_pipeline_tests/test_binary_classification.py index bc00195c06..974bd403a1 100644 --- a/evalml/tests/pipeline_tests/classification_pipeline_tests/test_binary_classification.py +++ b/evalml/tests/pipeline_tests/classification_pipeline_tests/test_binary_classification.py @@ -10,72 +10,72 @@ def test_binary_init(): - clf = BinaryClassificationPipeline(component_graph=["Imputer", "One Hot Encoder", "Random Forest Classifier"]) + clf = BinaryClassificationPipeline( + component_graph=["Imputer", "One Hot Encoder", "Random Forest Classifier"] + ) assert clf.parameters == { - 'Imputer': { - 'categorical_impute_strategy': 'most_frequent', - 'numeric_impute_strategy': 'mean', - 'categorical_fill_value': None, - 'numeric_fill_value': None + "Imputer": { + "categorical_impute_strategy": "most_frequent", + "numeric_impute_strategy": "mean", + "categorical_fill_value": None, + "numeric_fill_value": None, }, - 'One Hot Encoder': { - 'top_n': 10, - 'features_to_encode': None, - 'categories': None, - 'drop': 'if_binary', - 'handle_unknown': 'ignore', - 'handle_missing': 'error' + "One Hot Encoder": { + "top_n": 10, + "features_to_encode": None, + "categories": None, + "drop": "if_binary", + "handle_unknown": "ignore", + "handle_missing": "error", }, - 'Random Forest Classifier': { - 'n_estimators': 100, - 'max_depth': 6, - 'n_jobs': -1 - } + "Random Forest Classifier": {"n_estimators": 100, "max_depth": 6, "n_jobs": -1}, } assert clf.name == "Random Forest Classifier w/ Imputer + One Hot Encoder" assert clf.random_seed == 0 - parameters = { - "One Hot Encoder": { - "top_n": 20 - } - } - clf = BinaryClassificationPipeline(component_graph=["Imputer", "One Hot Encoder", "Random Forest Classifier"], - parameters=parameters, - custom_name="Custom Pipeline", - random_seed=42) + parameters = {"One Hot Encoder": {"top_n": 20}} + clf = BinaryClassificationPipeline( + component_graph=["Imputer", "One Hot Encoder", "Random Forest Classifier"], + parameters=parameters, + custom_name="Custom Pipeline", + random_seed=42, + ) assert clf.parameters == { - 'Imputer': { - 
'categorical_impute_strategy': 'most_frequent', - 'numeric_impute_strategy': 'mean', - 'categorical_fill_value': None, - 'numeric_fill_value': None + "Imputer": { + "categorical_impute_strategy": "most_frequent", + "numeric_impute_strategy": "mean", + "categorical_fill_value": None, + "numeric_fill_value": None, }, - 'One Hot Encoder': { - 'top_n': 20, - 'features_to_encode': None, - 'categories': None, - 'drop': 'if_binary', - 'handle_unknown': 'ignore', - 'handle_missing': 'error' + "One Hot Encoder": { + "top_n": 20, + "features_to_encode": None, + "categories": None, + "drop": "if_binary", + "handle_unknown": "ignore", + "handle_missing": "error", }, - 'Random Forest Classifier': { - 'n_estimators': 100, - 'max_depth': 6, - 'n_jobs': -1 - } + "Random Forest Classifier": {"n_estimators": 100, "max_depth": 6, "n_jobs": -1}, } assert clf.name == "Custom Pipeline" assert clf.random_seed == 42 -@patch('evalml.pipelines.ClassificationPipeline._decode_targets', return_value=[0, 1]) -@patch('evalml.objectives.BinaryClassificationObjective.decision_function', return_value=pd.Series([1, 0])) -@patch('evalml.pipelines.components.Estimator.predict_proba') -@patch('evalml.pipelines.components.Estimator.predict') -def test_binary_classification_pipeline_predict(mock_predict, mock_predict_proba, - mock_obj_decision, mock_decode, - X_y_binary, dummy_binary_pipeline_class): +@patch("evalml.pipelines.ClassificationPipeline._decode_targets", return_value=[0, 1]) +@patch( + "evalml.objectives.BinaryClassificationObjective.decision_function", + return_value=pd.Series([1, 0]), +) +@patch("evalml.pipelines.components.Estimator.predict_proba") +@patch("evalml.pipelines.components.Estimator.predict") +def test_binary_classification_pipeline_predict( + mock_predict, + mock_predict_proba, + mock_obj_decision, + mock_decode, + X_y_binary, + dummy_binary_pipeline_class, +): proba = pd.DataFrame([[0.1, 0.2], [0.1, 0.2]]) proba.ww.init() predict = ww.init_series(pd.Series([1, 0])) @@ -84,7 +84,9 @@ def test_binary_classification_pipeline_predict(mock_predict, mock_predict_proba mock_objs = [mock_decode, mock_predict] X, y = X_y_binary - binary_pipeline = dummy_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + binary_pipeline = dummy_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) # test no objective passed and no custom threshold uses underlying estimator's predict method binary_pipeline.fit(X, y) binary_pipeline.predict(X) @@ -93,7 +95,7 @@ def test_binary_classification_pipeline_predict(mock_predict, mock_predict_proba mock_obj.reset_mock() # test objective passed but no custom threshold uses underlying estimator's predict method - binary_pipeline.predict(X, 'precision') + binary_pipeline.predict(X, "precision") for mock_obj in mock_objs: mock_obj.assert_called() mock_obj.reset_mock() @@ -120,7 +122,7 @@ def test_binary_classification_pipeline_predict(mock_predict, mock_predict_proba # test custom threshold set and objective passed binary_pipeline.threshold = 0.6 - binary_pipeline.predict(X, 'precision') + binary_pipeline.predict(X, "precision") for mock_obj in mock_objs: mock_obj.assert_called() mock_obj.reset_mock() @@ -128,48 +130,77 @@ def test_binary_classification_pipeline_predict(mock_predict, mock_predict_proba mock_obj_decision.assert_called() -@patch('evalml.pipelines.ComponentGraph._compute_features') -def test_binary_predict_pipeline_objective_mismatch(mock_transform, X_y_binary, dummy_binary_pipeline_class): 
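As an illustrative aside to the mocked assertions in this hunk: a binary pipeline with no threshold defers to the estimator's predict; once `threshold` is set it derives labels from predict_proba, and if an objective such as "precision" is also passed, that objective's decision_function selects the labels. A minimal sketch of that flow, using only the public calls these tests already exercise (the tiny DataFrame and the two-component graph below are made up purely for illustration), might look like:

import pandas as pd
from evalml.pipelines import BinaryClassificationPipeline

# Made-up toy data, for illustration only.
X = pd.DataFrame({"a": [0, 1, 2, 3, 4, 5], "b": [1.0, 0.5, 0.2, 0.8, 0.9, 0.1]})
y = pd.Series([0, 0, 1, 1, 0, 1])

pipeline = BinaryClassificationPipeline(
    component_graph=["Imputer", "Random Forest Classifier"]
)
pipeline.fit(X, y)

# No threshold set: labels come straight from the estimator's predict.
pipeline.predict(X)

# Custom threshold set, no objective: labels are derived from predict_proba.
pipeline.threshold = 0.6
pipeline.predict(X)

# Custom threshold set and an objective passed: the objective's
# decision_function is used, which is what the mocked tests verify.
pipeline.predict(X, "precision")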
+@patch("evalml.pipelines.ComponentGraph._compute_features") +def test_binary_predict_pipeline_objective_mismatch( + mock_transform, X_y_binary, dummy_binary_pipeline_class +): X, y = X_y_binary - binary_pipeline = dummy_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + binary_pipeline = dummy_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) binary_pipeline.fit(X, y) - with pytest.raises(ValueError, match="You can only use a binary classification objective to make predictions for a binary classification pipeline."): + with pytest.raises( + ValueError, + match="You can only use a binary classification objective to make predictions for a binary classification pipeline.", + ): binary_pipeline.predict(X, "precision micro") mock_transform.assert_called() -@patch('evalml.objectives.FraudCost.decision_function') -def test_binary_predict_pipeline_use_objective(mock_decision_function, X_y_binary, logistic_regression_binary_pipeline_class): +@patch("evalml.objectives.FraudCost.decision_function") +def test_binary_predict_pipeline_use_objective( + mock_decision_function, X_y_binary, logistic_regression_binary_pipeline_class +): X, y = X_y_binary - binary_pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + binary_pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) mock_decision_function.return_value = pd.Series([0] * 100) binary_pipeline.threshold = 0.7 binary_pipeline.fit(X, y) fraud_cost = FraudCost(amount_col=0) - binary_pipeline.score(X, y, ['precision', 'auc', fraud_cost]) + binary_pipeline.score(X, y, ["precision", "auc", fraud_cost]) mock_decision_function.assert_called() -def test_binary_predict_pipeline_score_error(X_y_binary, logistic_regression_binary_pipeline_class): +def test_binary_predict_pipeline_score_error( + X_y_binary, logistic_regression_binary_pipeline_class +): X, y = X_y_binary - binary_pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + binary_pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) binary_pipeline.fit(X, y) - with pytest.raises(PipelineScoreError, match='Invalid objective MCC Multiclass specified for problem type binary'): - binary_pipeline.score(X, y, ['MCC Multiclass']) - - -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -@patch('evalml.pipelines.BinaryClassificationPipeline.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.predict_proba') -def test_pipeline_thresholding_errors(mock_binary_pred_proba, mock_binary_score, mock_binary_fit, - make_data_type, logistic_regression_binary_pipeline_class, X_y_binary): + with pytest.raises( + PipelineScoreError, + match="Invalid objective MCC Multiclass specified for problem type binary", + ): + binary_pipeline.score(X, y, ["MCC Multiclass"]) + + +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +@patch("evalml.pipelines.BinaryClassificationPipeline.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba") +def test_pipeline_thresholding_errors( + mock_binary_pred_proba, + mock_binary_score, + mock_binary_fit, + make_data_type, + logistic_regression_binary_pipeline_class, + X_y_binary, +): X, y = X_y_binary - X = make_data_type('ww', X) - y = make_data_type('ww', pd.Series([f"String value {i}" for i in y])) + X = 
make_data_type("ww", X) + y = make_data_type("ww", pd.Series([f"String value {i}" for i in y])) objective = get_objective("Log Loss Binary", return_instance=True) - pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) pipeline.fit(X, y) pred_proba = pipeline.predict_proba(X, y).iloc[:, 1] - with pytest.raises(ValueError, match="Problem type must be binary and objective must be optimizable"): + with pytest.raises( + ValueError, + match="Problem type must be binary and objective must be optimizable", + ): pipeline.optimize_threshold(X, y, pred_proba, objective) diff --git a/evalml/tests/pipeline_tests/classification_pipeline_tests/test_classification.py b/evalml/tests/pipeline_tests/classification_pipeline_tests/test_classification.py index 51d89ef864..608600a8ed 100644 --- a/evalml/tests/pipeline_tests/classification_pipeline_tests/test_classification.py +++ b/evalml/tests/pipeline_tests/classification_pipeline_tests/test_classification.py @@ -8,42 +8,63 @@ @pytest.mark.parametrize("problem_type", ["binary", "multi"]) -def test_new_unique_targets_in_score(X_y_binary, logistic_regression_binary_pipeline_class, - X_y_multi, logistic_regression_multiclass_pipeline_class, problem_type): +def test_new_unique_targets_in_score( + X_y_binary, + logistic_regression_binary_pipeline_class, + X_y_multi, + logistic_regression_multiclass_pipeline_class, + problem_type, +): if problem_type == "binary": X, y = X_y_binary - pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) - objective = 'Log Loss Binary' + pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) + objective = "Log Loss Binary" elif problem_type == "multi": X, y = X_y_multi - pipeline = logistic_regression_multiclass_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) - objective = 'Log Loss Multiclass' + pipeline = logistic_regression_multiclass_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) + objective = "Log Loss Multiclass" pipeline.fit(X, y) with pytest.raises(ValueError, match="y contains previously unseen labels"): pipeline.score(X, pd.Series([4] * len(y)), [objective]) -@pytest.mark.parametrize("problem_type,use_ints", product(["binary", "multi"], [True, False])) -def test_pipeline_has_classes_property(logistic_regression_binary_pipeline_class, - logistic_regression_multiclass_pipeline_class, problem_type, use_ints): +@pytest.mark.parametrize( + "problem_type,use_ints", product(["binary", "multi"], [True, False]) +) +def test_pipeline_has_classes_property( + logistic_regression_binary_pipeline_class, + logistic_regression_multiclass_pipeline_class, + problem_type, + use_ints, +): if problem_type == "binary": X, y = load_breast_cancer() - pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) if use_ints: - y = y.map({'malignant': 0, 'benign': 1}) + y = y.map({"malignant": 0, "benign": 1}) answer = [0, 1] else: answer = ["benign", "malignant"] elif problem_type == "multi": X, y = load_wine() - pipeline = logistic_regression_multiclass_pipeline_class(parameters={"Logistic Regression 
Classifier": {"n_jobs": 1}}) + pipeline = logistic_regression_multiclass_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) if use_ints: y = y.map({"class_0": 0, "class_1": 1, "class_2": 2}) answer = [0, 1, 2] else: answer = ["class_0", "class_1", "class_2"] - with pytest.raises(AttributeError, match="Cannot access class names before fitting the pipeline."): + with pytest.raises( + AttributeError, match="Cannot access class names before fitting the pipeline." + ): pipeline.classes_ pipeline.fit(X, y) @@ -52,7 +73,9 @@ def test_pipeline_has_classes_property(logistic_regression_binary_pipeline_class def test_woodwork_classification_pipeline(logistic_regression_binary_pipeline_class): X, y = load_breast_cancer() - mock_pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + mock_pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) mock_pipeline.fit(X, y) assert not pd.isnull(mock_pipeline.predict(X)).any() assert not pd.isnull(mock_pipeline.predict_proba(X)).any().any() diff --git a/evalml/tests/pipeline_tests/classification_pipeline_tests/test_multiclass_classification.py b/evalml/tests/pipeline_tests/classification_pipeline_tests/test_multiclass_classification.py index ef988bd87a..6be37a6fa6 100644 --- a/evalml/tests/pipeline_tests/classification_pipeline_tests/test_multiclass_classification.py +++ b/evalml/tests/pipeline_tests/classification_pipeline_tests/test_multiclass_classification.py @@ -1,62 +1,53 @@ - from evalml.pipelines import MulticlassClassificationPipeline def test_multiclass_init(): - clf = MulticlassClassificationPipeline(component_graph=["Imputer", "One Hot Encoder", "Random Forest Classifier"]) + clf = MulticlassClassificationPipeline( + component_graph=["Imputer", "One Hot Encoder", "Random Forest Classifier"] + ) assert clf.parameters == { - 'Imputer': { - 'categorical_impute_strategy': 'most_frequent', - 'numeric_impute_strategy': 'mean', - 'categorical_fill_value': None, - 'numeric_fill_value': None + "Imputer": { + "categorical_impute_strategy": "most_frequent", + "numeric_impute_strategy": "mean", + "categorical_fill_value": None, + "numeric_fill_value": None, }, - 'One Hot Encoder': { - 'top_n': 10, - 'features_to_encode': None, - 'categories': None, - 'drop': 'if_binary', - 'handle_unknown': 'ignore', - 'handle_missing': 'error' + "One Hot Encoder": { + "top_n": 10, + "features_to_encode": None, + "categories": None, + "drop": "if_binary", + "handle_unknown": "ignore", + "handle_missing": "error", }, - 'Random Forest Classifier': { - 'n_estimators': 100, - 'max_depth': 6, - 'n_jobs': -1 - } + "Random Forest Classifier": {"n_estimators": 100, "max_depth": 6, "n_jobs": -1}, } assert clf.name == "Random Forest Classifier w/ Imputer + One Hot Encoder" assert clf.random_seed == 0 - parameters = { - "One Hot Encoder": { - "top_n": 20 - } - } - clf = MulticlassClassificationPipeline(component_graph=["Imputer", "One Hot Encoder", "Random Forest Classifier"], - parameters=parameters, - custom_name="Custom Pipeline", - random_seed=42) + parameters = {"One Hot Encoder": {"top_n": 20}} + clf = MulticlassClassificationPipeline( + component_graph=["Imputer", "One Hot Encoder", "Random Forest Classifier"], + parameters=parameters, + custom_name="Custom Pipeline", + random_seed=42, + ) assert clf.parameters == { - 'Imputer': { - 'categorical_impute_strategy': 'most_frequent', - 'numeric_impute_strategy': 'mean', - 
'categorical_fill_value': None, - 'numeric_fill_value': None + "Imputer": { + "categorical_impute_strategy": "most_frequent", + "numeric_impute_strategy": "mean", + "categorical_fill_value": None, + "numeric_fill_value": None, }, - 'One Hot Encoder': { - 'top_n': 20, - 'features_to_encode': None, - 'categories': None, - 'drop': 'if_binary', - 'handle_unknown': 'ignore', - 'handle_missing': 'error' + "One Hot Encoder": { + "top_n": 20, + "features_to_encode": None, + "categories": None, + "drop": "if_binary", + "handle_unknown": "ignore", + "handle_missing": "error", }, - 'Random Forest Classifier': { - 'n_estimators': 100, - 'max_depth': 6, - 'n_jobs': -1 - } + "Random Forest Classifier": {"n_estimators": 100, "max_depth": 6, "n_jobs": -1}, } assert clf.name == "Custom Pipeline" assert clf.random_seed == 42 diff --git a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py index 1458000f14..fd64f3920f 100644 --- a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py +++ b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py @@ -7,67 +7,61 @@ def test_regression_init(): - clf = RegressionPipeline(component_graph=["Imputer", "One Hot Encoder", "Random Forest Regressor"]) + clf = RegressionPipeline( + component_graph=["Imputer", "One Hot Encoder", "Random Forest Regressor"] + ) assert clf.parameters == { - 'Imputer': { - 'categorical_impute_strategy': 'most_frequent', - 'numeric_impute_strategy': 'mean', - 'categorical_fill_value': None, - 'numeric_fill_value': None + "Imputer": { + "categorical_impute_strategy": "most_frequent", + "numeric_impute_strategy": "mean", + "categorical_fill_value": None, + "numeric_fill_value": None, }, - 'One Hot Encoder': { - 'top_n': 10, - 'features_to_encode': None, - 'categories': None, - 'drop': 'if_binary', - 'handle_unknown': 'ignore', - 'handle_missing': 'error' + "One Hot Encoder": { + "top_n": 10, + "features_to_encode": None, + "categories": None, + "drop": "if_binary", + "handle_unknown": "ignore", + "handle_missing": "error", }, - 'Random Forest Regressor': { - 'n_estimators': 100, - 'max_depth': 6, - 'n_jobs': -1 - } + "Random Forest Regressor": {"n_estimators": 100, "max_depth": 6, "n_jobs": -1}, } assert clf.name == "Random Forest Regressor w/ Imputer + One Hot Encoder" assert clf.random_seed == 0 - parameters = { - "One Hot Encoder": { - "top_n": 20 - } - } - clf = RegressionPipeline(component_graph=["Imputer", "One Hot Encoder", "Random Forest Regressor"], - parameters=parameters, - custom_name="Custom Pipeline", - random_seed=42) + parameters = {"One Hot Encoder": {"top_n": 20}} + clf = RegressionPipeline( + component_graph=["Imputer", "One Hot Encoder", "Random Forest Regressor"], + parameters=parameters, + custom_name="Custom Pipeline", + random_seed=42, + ) assert clf.parameters == { - 'Imputer': { - 'categorical_impute_strategy': 'most_frequent', - 'numeric_impute_strategy': 'mean', - 'categorical_fill_value': None, - 'numeric_fill_value': None + "Imputer": { + "categorical_impute_strategy": "most_frequent", + "numeric_impute_strategy": "mean", + "categorical_fill_value": None, + "numeric_fill_value": None, }, - 'One Hot Encoder': { - 'top_n': 20, - 'features_to_encode': None, - 'categories': None, - 'drop': 'if_binary', - 'handle_unknown': 'ignore', - 'handle_missing': 'error' + "One Hot Encoder": { + "top_n": 20, + "features_to_encode": None, + "categories": None, + "drop": "if_binary", + "handle_unknown": 
"ignore", + "handle_missing": "error", }, - 'Random Forest Regressor': { - 'n_estimators': 100, - 'max_depth': 6, - 'n_jobs': -1 - } + "Random Forest Regressor": {"n_estimators": 100, "max_depth": 6, "n_jobs": -1}, } assert clf.name == "Custom Pipeline" assert clf.random_seed == 42 @pytest.mark.parametrize("target_type", ["category", "string", "bool"]) -def test_invalid_targets_regression_pipeline(target_type, dummy_regression_pipeline_class): +def test_invalid_targets_regression_pipeline( + target_type, dummy_regression_pipeline_class +): X, y = load_wine() if target_type == "category": y = pd.Series(y).astype("category") @@ -75,22 +69,31 @@ def test_invalid_targets_regression_pipeline(target_type, dummy_regression_pipel X, y = load_breast_cancer() y = y.map({"malignant": False, "benign": True}) mock_regression_pipeline = dummy_regression_pipeline_class(parameters={}) - with pytest.raises(ValueError, match="Regression pipeline can only handle numeric target data"): + with pytest.raises( + ValueError, match="Regression pipeline can only handle numeric target data" + ): mock_regression_pipeline.fit(X, y) def test_woodwork_regression_pipeline(linear_regression_pipeline_class): X, y = load_diabetes() - regression_pipeline = linear_regression_pipeline_class(parameters={'Linear Regressor': {'n_jobs': 1}}) + regression_pipeline = linear_regression_pipeline_class( + parameters={"Linear Regressor": {"n_jobs": 1}} + ) regression_pipeline.fit(X, y) assert not pd.isnull(regression_pipeline.predict(X)).any() def test_custom_indices(): - X = pd.DataFrame({"a": ["a", "b", "a", "a", "a", "c", "c", "c"], "b": [0, 1, 1, 1, 1, 1, 0, 1]}) + X = pd.DataFrame( + {"a": ["a", "b", "a", "a", "a", "c", "c", "c"], "b": [0, 1, 1, 1, 1, 1, 0, 1]} + ) y = pd.Series([0, 0, 0, 1, 0, 1, 0, 0], index=[7, 2, 1, 4, 5, 3, 6, 8]) - x1, x2, y1, y2 = split_data(X, y, problem_type='regression') + x1, x2, y1, y2 = split_data(X, y, problem_type="regression") - pipeline = RegressionPipeline(component_graph=['Imputer', 'One Hot Encoder', 'Linear Regressor'], parameters={}) + pipeline = RegressionPipeline( + component_graph=["Imputer", "One Hot Encoder", "Linear Regressor"], + parameters={}, + ) pipeline.fit(x2, y2) assert not pd.isnull(pipeline.predict(X)).any() diff --git a/evalml/tests/pipeline_tests/test_component_graph.py b/evalml/tests/pipeline_tests/test_component_graph.py index 9cc55c086a..1a0f577452 100644 --- a/evalml/tests/pipeline_tests/test_component_graph.py +++ b/evalml/tests/pipeline_tests/test_component_graph.py @@ -3,11 +3,7 @@ import numpy as np import pandas as pd import pytest -from pandas.testing import ( - assert_frame_equal, - assert_index_equal, - assert_series_equal -) +from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal from evalml.exceptions import MissingComponentError from evalml.pipelines import ComponentGraph @@ -22,7 +18,7 @@ StandardScaler, TargetImputer, Transformer, - Undersampler + Undersampler, ) from evalml.utils import infer_feature_types @@ -31,7 +27,9 @@ class DummyTransformer(Transformer): name = "Dummy Transformer" def __init__(self, parameters={}, random_seed=0): - super().__init__(parameters=parameters, component_obj=None, random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=None, random_seed=random_seed + ) def fit(self, X, y): return self @@ -55,7 +53,9 @@ class DummyEstimator(Estimator): supported_problem_types = None def __init__(self, parameters={}, random_seed=0): - super().__init__(parameters=parameters, 
component_obj=None, random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=None, random_seed=random_seed + ) def fit(self, X, y): return self @@ -80,12 +80,18 @@ def dummy_components(): @pytest.fixture def example_graph(): - graph = {'Imputer': [Imputer], - 'OneHot_RandomForest': [OneHotEncoder, 'Imputer.x'], - 'OneHot_ElasticNet': [OneHotEncoder, 'Imputer.x'], - 'Random Forest': [RandomForestClassifier, 'OneHot_RandomForest.x'], - 'Elastic Net': [ElasticNetClassifier, 'OneHot_ElasticNet.x'], - 'Logistic Regression': [LogisticRegressionClassifier, 'Random Forest', 'Elastic Net']} + graph = { + "Imputer": [Imputer], + "OneHot_RandomForest": [OneHotEncoder, "Imputer.x"], + "OneHot_ElasticNet": [OneHotEncoder, "Imputer.x"], + "Random Forest": [RandomForestClassifier, "OneHot_RandomForest.x"], + "Elastic Net": [ElasticNetClassifier, "OneHot_ElasticNet.x"], + "Logistic Regression": [ + LogisticRegressionClassifier, + "Random Forest", + "Elastic Net", + ], + } return graph @@ -97,201 +103,313 @@ def test_init(example_graph): comp_graph = ComponentGraph(graph) assert len(comp_graph.component_dict) == 6 - expected_order = ['Imputer', 'OneHot_ElasticNet', 'Elastic Net', 'OneHot_RandomForest', 'Random Forest', 'Logistic Regression'] + expected_order = [ + "Imputer", + "OneHot_ElasticNet", + "Elastic Net", + "OneHot_RandomForest", + "Random Forest", + "Logistic Regression", + ] assert comp_graph.compute_order == expected_order def test_init_str_components(): - graph = {'Imputer': ['Imputer'], - 'OneHot_RandomForest': ['One Hot Encoder', 'Imputer.x'], - 'OneHot_ElasticNet': ['One Hot Encoder', 'Imputer.x'], - 'Random Forest': ['Random Forest Classifier', 'OneHot_RandomForest.x'], - 'Elastic Net': ['Elastic Net Classifier', 'OneHot_ElasticNet.x'], - 'Logistic Regression': ['Logistic Regression Classifier', 'Random Forest', 'Elastic Net']} + graph = { + "Imputer": ["Imputer"], + "OneHot_RandomForest": ["One Hot Encoder", "Imputer.x"], + "OneHot_ElasticNet": ["One Hot Encoder", "Imputer.x"], + "Random Forest": ["Random Forest Classifier", "OneHot_RandomForest.x"], + "Elastic Net": ["Elastic Net Classifier", "OneHot_ElasticNet.x"], + "Logistic Regression": [ + "Logistic Regression Classifier", + "Random Forest", + "Elastic Net", + ], + } comp_graph = ComponentGraph(graph) assert len(comp_graph.component_dict) == 6 - expected_order = ['Imputer', 'OneHot_ElasticNet', 'Elastic Net', 'OneHot_RandomForest', 'Random Forest', 'Logistic Regression'] + expected_order = [ + "Imputer", + "OneHot_ElasticNet", + "Elastic Net", + "OneHot_RandomForest", + "Random Forest", + "Logistic Regression", + ] assert comp_graph.compute_order == expected_order def test_invalid_init(): - invalid_graph = {'Imputer': [Imputer], 'OHE': OneHotEncoder} - with pytest.raises(ValueError, match='All component information should be passed in as a list'): + invalid_graph = {"Imputer": [Imputer], "OHE": OneHotEncoder} + with pytest.raises( + ValueError, match="All component information should be passed in as a list" + ): ComponentGraph(invalid_graph) - with pytest.raises(ValueError, match='may only contain str or ComponentBase subclasses'): - ComponentGraph({'Imputer': [Imputer(numeric_impute_strategy="most_frequent")], 'OneHot': [OneHotEncoder]}) + with pytest.raises( + ValueError, match="may only contain str or ComponentBase subclasses" + ): + ComponentGraph( + { + "Imputer": [Imputer(numeric_impute_strategy="most_frequent")], + "OneHot": [OneHotEncoder], + } + ) - graph = {'Imputer': 
[Imputer(numeric_impute_strategy='constant', numeric_fill_value=0)]} - with pytest.raises(ValueError, match='may only contain str or ComponentBase subclasses'): + graph = { + "Imputer": [Imputer(numeric_impute_strategy="constant", numeric_fill_value=0)] + } + with pytest.raises( + ValueError, match="may only contain str or ComponentBase subclasses" + ): ComponentGraph(graph) - graph = {'Imputer': ['Imputer', 'Fake'], - 'Fake': ['Fake Component', 'Estimator'], - 'Estimator': [ElasticNetClassifier]} + graph = { + "Imputer": ["Imputer", "Fake"], + "Fake": ["Fake Component", "Estimator"], + "Estimator": [ElasticNetClassifier], + } with pytest.raises(MissingComponentError): ComponentGraph(graph) def test_init_bad_graphs(): - graph = {'Imputer': [Imputer], - 'OHE': [OneHotEncoder, 'Imputer.x', 'Estimator'], - 'Estimator': [RandomForestClassifier, 'OHE.x']} - with pytest.raises(ValueError, match='given graph contains a cycle'): + graph = { + "Imputer": [Imputer], + "OHE": [OneHotEncoder, "Imputer.x", "Estimator"], + "Estimator": [RandomForestClassifier, "OHE.x"], + } + with pytest.raises(ValueError, match="given graph contains a cycle"): ComponentGraph(graph) - graph = {'Imputer': [Imputer], - 'OneHot_RandomForest': [OneHotEncoder, 'Imputer.x'], - 'OneHot_ElasticNet': [OneHotEncoder, 'Imputer.x'], - 'Random Forest': [RandomForestClassifier], - 'Elastic Net': [ElasticNetClassifier], - 'Logistic Regression': [LogisticRegressionClassifier, 'Random Forest', 'Elastic Net']} - with pytest.raises(ValueError, match='graph is not completely connected'): + graph = { + "Imputer": [Imputer], + "OneHot_RandomForest": [OneHotEncoder, "Imputer.x"], + "OneHot_ElasticNet": [OneHotEncoder, "Imputer.x"], + "Random Forest": [RandomForestClassifier], + "Elastic Net": [ElasticNetClassifier], + "Logistic Regression": [ + LogisticRegressionClassifier, + "Random Forest", + "Elastic Net", + ], + } + with pytest.raises(ValueError, match="graph is not completely connected"): ComponentGraph(graph) - graph = {'Imputer': ['Imputer'], - 'OneHot_RandomForest': ['One Hot Encoder', 'Imputer.x'], - 'OneHot_ElasticNet': ['One Hot Encoder', 'Imputer.x'], - 'Random Forest': ['Random Forest Classifier', 'OneHot_RandomForest.x'], - 'Elastic Net': ['Elastic Net Classifier'], - 'Logistic Regression': ['Logistic Regression Classifier', 'Random Forest', 'Elastic Net']} - with pytest.raises(ValueError, match='graph has more than one final'): + graph = { + "Imputer": ["Imputer"], + "OneHot_RandomForest": ["One Hot Encoder", "Imputer.x"], + "OneHot_ElasticNet": ["One Hot Encoder", "Imputer.x"], + "Random Forest": ["Random Forest Classifier", "OneHot_RandomForest.x"], + "Elastic Net": ["Elastic Net Classifier"], + "Logistic Regression": [ + "Logistic Regression Classifier", + "Random Forest", + "Elastic Net", + ], + } + with pytest.raises(ValueError, match="graph has more than one final"): ComponentGraph(graph) def test_order_x_and_y(): - graph = {'Imputer': [Imputer], - 'OHE': [OneHotEncoder, 'Imputer.x', 'Imputer.y'], - 'Random Forest': [RandomForestClassifier, 'OHE.x']} + graph = { + "Imputer": [Imputer], + "OHE": [OneHotEncoder, "Imputer.x", "Imputer.y"], + "Random Forest": [RandomForestClassifier, "OHE.x"], + } component_graph = ComponentGraph(graph).instantiate({}) - assert component_graph.compute_order == ['Imputer', 'OHE', 'Random Forest'] + assert component_graph.compute_order == ["Imputer", "OHE", "Random Forest"] def test_from_list(): - component_list = ['Imputer', 'One Hot Encoder', RandomForestClassifier] + component_list = 
["Imputer", "One Hot Encoder", RandomForestClassifier] component_graph = ComponentGraph.from_list(component_list) assert len(component_graph.component_dict) == 3 - assert component_graph.get_component('Imputer') == Imputer - assert component_graph.get_component('One Hot Encoder') == OneHotEncoder - assert component_graph.get_component('Random Forest Classifier') == RandomForestClassifier - - expected_order = ['Imputer', 'One Hot Encoder', 'Random Forest Classifier'] + assert component_graph.get_component("Imputer") == Imputer + assert component_graph.get_component("One Hot Encoder") == OneHotEncoder + assert ( + component_graph.get_component("Random Forest Classifier") + == RandomForestClassifier + ) + + expected_order = ["Imputer", "One Hot Encoder", "Random Forest Classifier"] assert component_graph.compute_order == expected_order assert component_graph.component_dict == { - 'Imputer': [Imputer], - 'One Hot Encoder': [OneHotEncoder, 'Imputer.x'], - 'Random Forest Classifier': [RandomForestClassifier, 'One Hot Encoder.x'] + "Imputer": [Imputer], + "One Hot Encoder": [OneHotEncoder, "Imputer.x"], + "Random Forest Classifier": [RandomForestClassifier, "One Hot Encoder.x"], } - bad_component_list = ['Imputer', 'Fake Estimator'] - with pytest.raises(MissingComponentError, match='was not found'): + bad_component_list = ["Imputer", "Fake Estimator"] + with pytest.raises(MissingComponentError, match="was not found"): ComponentGraph.from_list(bad_component_list) def test_from_list_repeat_component(): - component_list = ['Imputer', 'One Hot Encoder', 'One Hot Encoder', RandomForestClassifier] + component_list = [ + "Imputer", + "One Hot Encoder", + "One Hot Encoder", + RandomForestClassifier, + ] component_graph = ComponentGraph.from_list(component_list) - expected_order = ['Imputer', 'One Hot Encoder', 'One Hot Encoder_2', 'Random Forest Classifier'] + expected_order = [ + "Imputer", + "One Hot Encoder", + "One Hot Encoder_2", + "Random Forest Classifier", + ] assert component_graph.compute_order == expected_order - component_graph.instantiate({'One Hot Encoder': {'top_n': 2}, - 'One Hot Encoder_2': {'top_n': 11}}) - assert component_graph.get_component('One Hot Encoder').parameters['top_n'] == 2 - assert component_graph.get_component('One Hot Encoder_2').parameters['top_n'] == 11 + component_graph.instantiate( + {"One Hot Encoder": {"top_n": 2}, "One Hot Encoder_2": {"top_n": 11}} + ) + assert component_graph.get_component("One Hot Encoder").parameters["top_n"] == 2 + assert component_graph.get_component("One Hot Encoder_2").parameters["top_n"] == 11 def test_instantiate_with_parameters(example_graph): graph = example_graph component_graph = ComponentGraph(graph) - assert not isinstance(component_graph.get_component('Imputer'), Imputer) - assert not isinstance(component_graph.get_component('Elastic Net'), ElasticNetClassifier) + assert not isinstance(component_graph.get_component("Imputer"), Imputer) + assert not isinstance( + component_graph.get_component("Elastic Net"), ElasticNetClassifier + ) - parameters = {'OneHot_RandomForest': {'top_n': 3}, - 'OneHot_ElasticNet': {'top_n': 5}, - 'Elastic Net': {'max_iter': 100}} + parameters = { + "OneHot_RandomForest": {"top_n": 3}, + "OneHot_ElasticNet": {"top_n": 5}, + "Elastic Net": {"max_iter": 100}, + } component_graph.instantiate(parameters) - expected_order = ['Imputer', 'OneHot_ElasticNet', 'Elastic Net', 'OneHot_RandomForest', 'Random Forest', 'Logistic Regression'] + expected_order = [ + "Imputer", + "OneHot_ElasticNet", + "Elastic Net", + 
"OneHot_RandomForest", + "Random Forest", + "Logistic Regression", + ] assert component_graph.compute_order == expected_order - assert isinstance(component_graph.get_component('Imputer'), Imputer) - assert isinstance(component_graph.get_component('Random Forest'), RandomForestClassifier) - assert isinstance(component_graph.get_component('Logistic Regression'), LogisticRegressionClassifier) - assert component_graph.get_component('OneHot_RandomForest').parameters['top_n'] == 3 - assert component_graph.get_component('OneHot_ElasticNet').parameters['top_n'] == 5 - assert component_graph.get_component('Elastic Net').parameters['max_iter'] == 100 + assert isinstance(component_graph.get_component("Imputer"), Imputer) + assert isinstance( + component_graph.get_component("Random Forest"), RandomForestClassifier + ) + assert isinstance( + component_graph.get_component("Logistic Regression"), + LogisticRegressionClassifier, + ) + assert component_graph.get_component("OneHot_RandomForest").parameters["top_n"] == 3 + assert component_graph.get_component("OneHot_ElasticNet").parameters["top_n"] == 5 + assert component_graph.get_component("Elastic Net").parameters["max_iter"] == 100 def test_instantiate_without_parameters(example_graph): graph = example_graph component_graph = ComponentGraph(graph) component_graph.instantiate({}) - assert component_graph.get_component('OneHot_RandomForest').parameters['top_n'] == 10 - assert component_graph.get_component('OneHot_ElasticNet').parameters['top_n'] == 10 - assert component_graph.get_component('OneHot_RandomForest') is not component_graph.get_component('OneHot_ElasticNet') - - expected_order = ['Imputer', 'OneHot_ElasticNet', 'Elastic Net', 'OneHot_RandomForest', 'Random Forest', 'Logistic Regression'] + assert ( + component_graph.get_component("OneHot_RandomForest").parameters["top_n"] == 10 + ) + assert component_graph.get_component("OneHot_ElasticNet").parameters["top_n"] == 10 + assert component_graph.get_component( + "OneHot_RandomForest" + ) is not component_graph.get_component("OneHot_ElasticNet") + + expected_order = [ + "Imputer", + "OneHot_ElasticNet", + "Elastic Net", + "OneHot_RandomForest", + "Random Forest", + "Logistic Regression", + ] assert component_graph.compute_order == expected_order def test_instantiate_from_list(): - component_list = ['Imputer', 'One Hot Encoder', 'Random Forest Classifier'] + component_list = ["Imputer", "One Hot Encoder", "Random Forest Classifier"] component_graph = ComponentGraph().from_list(component_list) - parameters = {'One Hot Encoder': {'top_n': 7}} + parameters = {"One Hot Encoder": {"top_n": 7}} component_graph.instantiate(parameters) - assert isinstance(component_graph.get_component('Imputer'), Imputer) - assert isinstance(component_graph.get_component('Random Forest Classifier'), RandomForestClassifier) - assert component_graph.get_component('One Hot Encoder').parameters['top_n'] == 7 + assert isinstance(component_graph.get_component("Imputer"), Imputer) + assert isinstance( + component_graph.get_component("Random Forest Classifier"), + RandomForestClassifier, + ) + assert component_graph.get_component("One Hot Encoder").parameters["top_n"] == 7 def test_reinstantiate(example_graph): component_graph = ComponentGraph(example_graph) component_graph.instantiate({}) - with pytest.raises(ValueError, match='Cannot reinstantiate a component graph'): - component_graph.instantiate({'OneHot': {'top_n': 7}}) + with pytest.raises(ValueError, match="Cannot reinstantiate a component graph"): + 
component_graph.instantiate({"OneHot": {"top_n": 7}}) def test_bad_instantiate_can_reinstantiate(example_graph): component_graph = ComponentGraph(example_graph) - with pytest.raises(ValueError, match='Error received when instantiating component'): - component_graph.instantiate(parameters={'Elastic Net': {'max_iter': 100, 'fake_param': None}}) + with pytest.raises(ValueError, match="Error received when instantiating component"): + component_graph.instantiate( + parameters={"Elastic Net": {"max_iter": 100, "fake_param": None}} + ) - component_graph.instantiate({'Elastic Net': {'max_iter': 22}}) - assert component_graph.get_component('Elastic Net').parameters['max_iter'] == 22 + component_graph.instantiate({"Elastic Net": {"max_iter": 22}}) + assert component_graph.get_component("Elastic Net").parameters["max_iter"] == 22 def test_get_component(example_graph): graph = example_graph component_graph = ComponentGraph(graph) - assert component_graph.get_component('OneHot_ElasticNet') == OneHotEncoder - assert component_graph.get_component('Logistic Regression') == LogisticRegressionClassifier - - with pytest.raises(ValueError, match='not in the graph'): - component_graph.get_component('Fake Component') - - component_graph.instantiate({'OneHot_RandomForest': {'top_n': 3}, - 'Random Forest': {'max_depth': 4, 'n_estimators': 50}}) - assert component_graph.get_component('OneHot_ElasticNet') == OneHotEncoder() - assert component_graph.get_component('OneHot_RandomForest') == OneHotEncoder(top_n=3) - assert component_graph.get_component('Random Forest') == RandomForestClassifier(n_estimators=50, max_depth=4) + assert component_graph.get_component("OneHot_ElasticNet") == OneHotEncoder + assert ( + component_graph.get_component("Logistic Regression") + == LogisticRegressionClassifier + ) + + with pytest.raises(ValueError, match="not in the graph"): + component_graph.get_component("Fake Component") + + component_graph.instantiate( + { + "OneHot_RandomForest": {"top_n": 3}, + "Random Forest": {"max_depth": 4, "n_estimators": 50}, + } + ) + assert component_graph.get_component("OneHot_ElasticNet") == OneHotEncoder() + assert component_graph.get_component("OneHot_RandomForest") == OneHotEncoder( + top_n=3 + ) + assert component_graph.get_component("Random Forest") == RandomForestClassifier( + n_estimators=50, max_depth=4 + ) def test_get_estimators(example_graph): component_graph = ComponentGraph(example_graph) - with pytest.raises(ValueError, match='Cannot get estimators until'): + with pytest.raises(ValueError, match="Cannot get estimators until"): component_graph.get_estimators() component_graph.instantiate({}) - assert component_graph.get_estimators() == [RandomForestClassifier(), ElasticNetClassifier(), LogisticRegressionClassifier()] + assert component_graph.get_estimators() == [ + RandomForestClassifier(), + ElasticNetClassifier(), + LogisticRegressionClassifier(), + ] - component_graph = ComponentGraph.from_list(['Imputer', 'One Hot Encoder']) + component_graph = ComponentGraph.from_list(["Imputer", "One Hot Encoder"]) component_graph.instantiate({}) assert component_graph.get_estimators() == [] @@ -300,31 +418,39 @@ def test_parents(example_graph): graph = example_graph component_graph = ComponentGraph(graph) - assert component_graph.get_parents('Imputer') == [] - assert component_graph.get_parents('OneHot_RandomForest') == ['Imputer.x'] - assert component_graph.get_parents('OneHot_ElasticNet') == ['Imputer.x'] - assert component_graph.get_parents('Random Forest') == ['OneHot_RandomForest.x'] - 
assert component_graph.get_parents('Elastic Net') == ['OneHot_ElasticNet.x'] - assert component_graph.get_parents('Logistic Regression') == ['Random Forest', 'Elastic Net'] + assert component_graph.get_parents("Imputer") == [] + assert component_graph.get_parents("OneHot_RandomForest") == ["Imputer.x"] + assert component_graph.get_parents("OneHot_ElasticNet") == ["Imputer.x"] + assert component_graph.get_parents("Random Forest") == ["OneHot_RandomForest.x"] + assert component_graph.get_parents("Elastic Net") == ["OneHot_ElasticNet.x"] + assert component_graph.get_parents("Logistic Regression") == [ + "Random Forest", + "Elastic Net", + ] - with pytest.raises(ValueError, match='not in the graph'): - component_graph.get_parents('Fake component') + with pytest.raises(ValueError, match="not in the graph"): + component_graph.get_parents("Fake component") component_graph.instantiate({}) - assert component_graph.get_parents('Imputer') == [] - assert component_graph.get_parents('OneHot_RandomForest') == ['Imputer.x'] - assert component_graph.get_parents('OneHot_ElasticNet') == ['Imputer.x'] - assert component_graph.get_parents('Random Forest') == ['OneHot_RandomForest.x'] - assert component_graph.get_parents('Elastic Net') == ['OneHot_ElasticNet.x'] - assert component_graph.get_parents('Logistic Regression') == ['Random Forest', 'Elastic Net'] + assert component_graph.get_parents("Imputer") == [] + assert component_graph.get_parents("OneHot_RandomForest") == ["Imputer.x"] + assert component_graph.get_parents("OneHot_ElasticNet") == ["Imputer.x"] + assert component_graph.get_parents("Random Forest") == ["OneHot_RandomForest.x"] + assert component_graph.get_parents("Elastic Net") == ["OneHot_ElasticNet.x"] + assert component_graph.get_parents("Logistic Regression") == [ + "Random Forest", + "Elastic Net", + ] - with pytest.raises(ValueError, match='not in the graph'): - component_graph.get_parents('Fake component') + with pytest.raises(ValueError, match="not in the graph"): + component_graph.get_parents("Fake component") def test_get_last_component(example_graph): component_graph = ComponentGraph() - with pytest.raises(ValueError, match='Cannot get last component from edgeless graph'): + with pytest.raises( + ValueError, match="Cannot get last component from edgeless graph" + ): component_graph.get_last_component() component_graph = ComponentGraph(example_graph) @@ -333,20 +459,24 @@ def test_get_last_component(example_graph): component_graph.instantiate({}) assert component_graph.get_last_component() == LogisticRegressionClassifier() - component_graph = ComponentGraph({'Imputer': [Imputer]}) + component_graph = ComponentGraph({"Imputer": [Imputer]}) assert component_graph.get_last_component() == Imputer - component_graph = ComponentGraph({'Imputer': [Imputer], 'OneHot': [OneHotEncoder, 'Imputer']}) + component_graph = ComponentGraph( + {"Imputer": [Imputer], "OneHot": [OneHotEncoder, "Imputer"]} + ) assert component_graph.get_last_component() == OneHotEncoder - component_graph = ComponentGraph({'Imputer': [Imputer], 'OneHot': [OneHotEncoder]}) - with pytest.raises(ValueError, match='Cannot get last component from edgeless graph'): + component_graph = ComponentGraph({"Imputer": [Imputer], "OneHot": [OneHotEncoder]}) + with pytest.raises( + ValueError, match="Cannot get last component from edgeless graph" + ): component_graph.get_last_component() -@patch('evalml.pipelines.components.Transformer.fit_transform') -@patch('evalml.pipelines.components.Estimator.fit') 
-@patch('evalml.pipelines.components.Estimator.predict') +@patch("evalml.pipelines.components.Transformer.fit_transform") +@patch("evalml.pipelines.components.Estimator.fit") +@patch("evalml.pipelines.components.Estimator.predict") def test_fit(mock_predict, mock_fit, mock_fit_transform, example_graph, X_y_binary): X, y = X_y_binary mock_fit_transform.return_value = pd.DataFrame(X) @@ -359,13 +489,15 @@ def test_fit(mock_predict, mock_fit, mock_fit_transform, example_graph, X_y_bina assert mock_predict.call_count == 2 -@patch('evalml.pipelines.components.Imputer.fit_transform') -@patch('evalml.pipelines.components.OneHotEncoder.fit_transform') -def test_fit_correct_inputs(mock_ohe_fit_transform, mock_imputer_fit_transform, X_y_binary): +@patch("evalml.pipelines.components.Imputer.fit_transform") +@patch("evalml.pipelines.components.OneHotEncoder.fit_transform") +def test_fit_correct_inputs( + mock_ohe_fit_transform, mock_imputer_fit_transform, X_y_binary +): X, y = X_y_binary X = pd.DataFrame(X) y = pd.Series(y) - graph = {'Imputer': [Imputer], 'OHE': [OneHotEncoder, 'Imputer.x', 'Imputer.y']} + graph = {"Imputer": [Imputer], "OHE": [OneHotEncoder, "Imputer.x", "Imputer.y"]} expected_x = pd.DataFrame(index=X.index, columns=X.index).fillna(1) expected_y = pd.Series(index=y.index).fillna(0) mock_imputer_fit_transform.return_value = tuple((expected_x, expected_y)) @@ -377,12 +509,12 @@ def test_fit_correct_inputs(mock_ohe_fit_transform, mock_imputer_fit_transform, assert_series_equal(expected_y, mock_ohe_fit_transform.call_args[0][1]) -@patch('evalml.pipelines.components.Transformer.fit_transform') -@patch('evalml.pipelines.components.Estimator.fit') -@patch('evalml.pipelines.components.Estimator.predict') +@patch("evalml.pipelines.components.Transformer.fit_transform") +@patch("evalml.pipelines.components.Estimator.fit") +@patch("evalml.pipelines.components.Estimator.predict") def test_fit_features(mock_predict, mock_fit, mock_fit_transform, X_y_binary): X, y = X_y_binary - component_list = ['Imputer', 'One Hot Encoder', 'Random Forest Classifier'] + component_list = ["Imputer", "One Hot Encoder", "Random Forest Classifier"] component_graph = ComponentGraph.from_list(component_list) component_graph.instantiate({}) @@ -397,10 +529,12 @@ def test_fit_features(mock_predict, mock_fit, mock_fit_transform, X_y_binary): assert mock_predict.call_count == 0 -@patch('evalml.pipelines.components.Transformer.fit_transform') -@patch('evalml.pipelines.components.Estimator.fit') -@patch('evalml.pipelines.components.Estimator.predict') -def test_fit_features_nonlinear(mock_predict, mock_fit, mock_fit_transform, example_graph, X_y_binary): +@patch("evalml.pipelines.components.Transformer.fit_transform") +@patch("evalml.pipelines.components.Estimator.fit") +@patch("evalml.pipelines.components.Estimator.predict") +def test_fit_features_nonlinear( + mock_predict, mock_fit, mock_fit_transform, example_graph, X_y_binary +): X, y = X_y_binary component_graph = ComponentGraph(example_graph) component_graph.instantiate({}) @@ -417,8 +551,8 @@ def test_fit_features_nonlinear(mock_predict, mock_fit, mock_fit_transform, exam assert mock_predict.call_count == 2 -@patch('evalml.pipelines.components.Estimator.fit') -@patch('evalml.pipelines.components.Estimator.predict') +@patch("evalml.pipelines.components.Estimator.fit") +@patch("evalml.pipelines.components.Estimator.predict") def test_predict(mock_predict, mock_fit, example_graph, X_y_binary): X, y = X_y_binary mock_predict.return_value = pd.Series(y) @@ -426,35 
+560,46 @@ def test_predict(mock_predict, mock_fit, example_graph, X_y_binary): component_graph.fit(X, y) component_graph.predict(X) - assert mock_predict.call_count == 5 # Called twice when fitting pipeline, thrice when predicting + assert ( + mock_predict.call_count == 5 + ) # Called twice when fitting pipeline, thrice when predicting assert mock_fit.call_count == 3 # Only called during fit, not predict -@patch('evalml.pipelines.components.Estimator.fit') -@patch('evalml.pipelines.components.Estimator.predict') +@patch("evalml.pipelines.components.Estimator.fit") +@patch("evalml.pipelines.components.Estimator.predict") def test_predict_repeat_estimator(mock_predict, mock_fit, X_y_binary): X, y = X_y_binary mock_predict.return_value = pd.Series(y) - graph = {'Imputer': [Imputer], - 'OneHot_RandomForest': [OneHotEncoder, 'Imputer.x'], - 'OneHot_Logistic': [OneHotEncoder, 'Imputer.x'], - 'Random Forest': [RandomForestClassifier, 'OneHot_RandomForest.x'], - 'Logistic Regression': [LogisticRegressionClassifier, 'OneHot_Logistic.x'], - 'Final Estimator': [LogisticRegressionClassifier, 'Random Forest', 'Logistic Regression']} + graph = { + "Imputer": [Imputer], + "OneHot_RandomForest": [OneHotEncoder, "Imputer.x"], + "OneHot_Logistic": [OneHotEncoder, "Imputer.x"], + "Random Forest": [RandomForestClassifier, "OneHot_RandomForest.x"], + "Logistic Regression": [LogisticRegressionClassifier, "OneHot_Logistic.x"], + "Final Estimator": [ + LogisticRegressionClassifier, + "Random Forest", + "Logistic Regression", + ], + } component_graph = ComponentGraph(graph) component_graph.instantiate({}) component_graph.fit(X, y) - assert not component_graph.get_component('Logistic Regression')._component_obj == component_graph.get_component('Final Estimator')._component_obj + assert ( + not component_graph.get_component("Logistic Regression")._component_obj + == component_graph.get_component("Final Estimator")._component_obj + ) component_graph.predict(X) assert mock_predict.call_count == 5 assert mock_fit.call_count == 3 -@patch('evalml.pipelines.components.Imputer.transform') -@patch('evalml.pipelines.components.OneHotEncoder.transform') +@patch("evalml.pipelines.components.Imputer.transform") +@patch("evalml.pipelines.components.OneHotEncoder.transform") def test_compute_final_component_features_linear(mock_ohe, mock_imputer, X_y_binary): X, y = X_y_binary X = pd.DataFrame(X) @@ -462,7 +607,7 @@ def test_compute_final_component_features_linear(mock_ohe, mock_imputer, X_y_bin mock_imputer.return_value = X mock_ohe.return_value = X_expected - component_list = ['Imputer', 'One Hot Encoder', 'Random Forest Classifier'] + component_list = ["Imputer", "One Hot Encoder", "Random Forest Classifier"] component_graph = ComponentGraph().from_list(component_list) component_graph.instantiate({}) component_graph.fit(X, y) @@ -473,17 +618,21 @@ def test_compute_final_component_features_linear(mock_ohe, mock_imputer, X_y_bin assert mock_ohe.call_count == 2 -@patch('evalml.pipelines.components.Imputer.transform') -@patch('evalml.pipelines.components.OneHotEncoder.transform') -@patch('evalml.pipelines.components.RandomForestClassifier.predict') -@patch('evalml.pipelines.components.ElasticNetClassifier.predict') -def test_compute_final_component_features_nonlinear(mock_en_predict, mock_rf_predict, mock_ohe, mock_imputer, example_graph, X_y_binary): +@patch("evalml.pipelines.components.Imputer.transform") +@patch("evalml.pipelines.components.OneHotEncoder.transform") 
+@patch("evalml.pipelines.components.RandomForestClassifier.predict") +@patch("evalml.pipelines.components.ElasticNetClassifier.predict") +def test_compute_final_component_features_nonlinear( + mock_en_predict, mock_rf_predict, mock_ohe, mock_imputer, example_graph, X_y_binary +): X, y = X_y_binary mock_imputer.return_value = pd.DataFrame(X) mock_ohe.return_value = pd.DataFrame(X) mock_en_predict.return_value = pd.Series(np.ones(X.shape[0])) mock_rf_predict.return_value = pd.Series(np.zeros(X.shape[0])) - X_expected = pd.DataFrame({'Random Forest': np.zeros(X.shape[0]), 'Elastic Net': np.ones(X.shape[0])}) + X_expected = pd.DataFrame( + {"Random Forest": np.zeros(X.shape[0]), "Elastic Net": np.ones(X.shape[0])} + ) component_graph = ComponentGraph(example_graph).instantiate({}) component_graph.fit(X, y) @@ -493,24 +642,28 @@ def test_compute_final_component_features_nonlinear(mock_en_predict, mock_rf_pre assert mock_ohe.call_count == 4 -@patch(f'{__name__}.DummyTransformer.transform') +@patch(f"{__name__}.DummyTransformer.transform") def test_compute_final_component_features_single_component(mock_transform, X_y_binary): X, y = X_y_binary X = pd.DataFrame(X) mock_transform.return_value = X - component_graph = ComponentGraph({'Dummy Component': [DummyTransformer]}).instantiate({}) + component_graph = ComponentGraph( + {"Dummy Component": [DummyTransformer]} + ).instantiate({}) component_graph.fit(X, y) X_t = component_graph.compute_final_component_features(X) assert_frame_equal(X, X_t) -@patch('evalml.pipelines.components.Imputer.fit_transform') +@patch("evalml.pipelines.components.Imputer.fit_transform") def test_fit_y_parent(mock_fit_transform, X_y_binary): X, y = X_y_binary - graph = {'Imputer': [Imputer], - 'OHE': [OneHotEncoder, 'Imputer.x', 'Imputer.y'], - 'Random Forest': [RandomForestClassifier, 'OHE.x']} + graph = { + "Imputer": [Imputer], + "OHE": [OneHotEncoder, "Imputer.x", "Imputer.y"], + "Random Forest": [RandomForestClassifier, "OHE.x"], + } component_graph = ComponentGraph(graph).instantiate({}) mock_fit_transform.return_value = tuple((pd.DataFrame(X), pd.Series(y))) @@ -529,11 +682,11 @@ def test_predict_empty_graph(X_y_binary): assert_frame_equal(X, X_t) -@patch('evalml.pipelines.components.OneHotEncoder.fit_transform') -@patch('evalml.pipelines.components.OneHotEncoder.transform') +@patch("evalml.pipelines.components.OneHotEncoder.fit_transform") +@patch("evalml.pipelines.components.OneHotEncoder.transform") def test_predict_transformer_end(mock_fit_transform, mock_transform, X_y_binary): X, y = X_y_binary - graph = {'Imputer': [Imputer], 'OHE': [OneHotEncoder, 'Imputer.x']} + graph = {"Imputer": [Imputer], "OHE": [OneHotEncoder, "Imputer.x"]} component_graph = ComponentGraph(graph).instantiate({}) mock_fit_transform.return_value = tuple((pd.DataFrame(X), pd.Series(y))) mock_transform.return_value = tuple((pd.DataFrame(X), pd.Series(y))) @@ -545,49 +698,78 @@ def test_predict_transformer_end(mock_fit_transform, mock_transform, X_y_binary) def test_no_instantiate_before_fit(X_y_binary): X, y = X_y_binary - graph = {'Imputer': [Imputer], - 'OHE': [OneHotEncoder, 'Imputer.x'], - 'Estimator': [RandomForestClassifier, 'OHE.x']} + graph = { + "Imputer": [Imputer], + "OHE": [OneHotEncoder, "Imputer.x"], + "Estimator": [RandomForestClassifier, "OHE.x"], + } component_graph = ComponentGraph(graph) - with pytest.raises(ValueError, match='All components must be instantiated before fitting or predicting'): + with pytest.raises( + ValueError, + match="All components must be 
instantiated before fitting or predicting", + ): component_graph.fit(X, y) -@patch('evalml.pipelines.components.Imputer.fit_transform') +@patch("evalml.pipelines.components.Imputer.fit_transform") def test_multiple_y_parents(mock_fit_transform, X_y_binary): X, y = X_y_binary - graph = {'Imputer': [Imputer], - 'OHE': [OneHotEncoder, 'Imputer.x'], - 'Estimator': [RandomForestClassifier, 'Imputer.y', 'OHE.y']} + graph = { + "Imputer": [Imputer], + "OHE": [OneHotEncoder, "Imputer.x"], + "Estimator": [RandomForestClassifier, "Imputer.y", "OHE.y"], + } component_graph = ComponentGraph(graph) component_graph.instantiate({}) mock_fit_transform.return_value = tuple((pd.DataFrame(X), pd.Series(y))) - with pytest.raises(ValueError, match='Cannot have multiple `y` parents for a single component'): + with pytest.raises( + ValueError, match="Cannot have multiple `y` parents for a single component" + ): component_graph.fit(X, y) def test_component_graph_order(example_graph): component_graph = ComponentGraph(example_graph) - expected_order = ['Imputer', 'OneHot_ElasticNet', 'Elastic Net', 'OneHot_RandomForest', 'Random Forest', 'Logistic Regression'] + expected_order = [ + "Imputer", + "OneHot_ElasticNet", + "Elastic Net", + "OneHot_RandomForest", + "Random Forest", + "Logistic Regression", + ] assert expected_order == component_graph.compute_order - component_graph = ComponentGraph({'Imputer': [Imputer]}) - expected_order = ['Imputer'] + component_graph = ComponentGraph({"Imputer": [Imputer]}) + expected_order = ["Imputer"] assert expected_order == component_graph.compute_order -@pytest.mark.parametrize("index", [list(range(-5, 0)), - list(range(100, 105)), - [f"row_{i}" for i in range(5)], - pd.date_range("2020-09-08", periods=5)]) +@pytest.mark.parametrize( + "index", + [ + list(range(-5, 0)), + list(range(100, 105)), + [f"row_{i}" for i in range(5)], + pd.date_range("2020-09-08", periods=5), + ], +) def test_computation_input_custom_index(index): - graph = {'OneHot': [OneHotEncoder], - 'Random Forest': [RandomForestClassifier, 'OneHot.x'], - 'Elastic Net': [ElasticNetClassifier, 'OneHot.x'], - 'Logistic Regression': [LogisticRegressionClassifier, 'Random Forest', 'Elastic Net']} + graph = { + "OneHot": [OneHotEncoder], + "Random Forest": [RandomForestClassifier, "OneHot.x"], + "Elastic Net": [ElasticNetClassifier, "OneHot.x"], + "Logistic Regression": [ + LogisticRegressionClassifier, + "Random Forest", + "Elastic Net", + ], + } - X = pd.DataFrame({"categories": [f"cat_{i}" for i in range(5)], "numbers": np.arange(5)}, - index=index) + X = pd.DataFrame( + {"categories": [f"cat_{i}" for i in range(5)], "numbers": np.arange(5)}, + index=index, + ) y = pd.Series([1, 2, 1, 2, 1]) component_graph = ComponentGraph(graph) component_graph.instantiate({}) @@ -598,125 +780,246 @@ def test_computation_input_custom_index(index): assert not X_t.isna().any(axis=None) -@patch(f'{__name__}.EstimatorC.predict') -@patch(f'{__name__}.EstimatorB.predict') -@patch(f'{__name__}.EstimatorA.predict') -@patch(f'{__name__}.TransformerC.transform') -@patch(f'{__name__}.TransformerB.transform') -@patch(f'{__name__}.TransformerA.transform') -def test_component_graph_evaluation_plumbing(mock_transa, mock_transb, mock_transc, mock_preda, mock_predb, mock_predc, dummy_components): - TransformerA, TransformerB, TransformerC, EstimatorA, EstimatorB, EstimatorC = dummy_components - mock_transa.return_value = pd.DataFrame({'feature trans': [1, 0, 0, 0, 0, 0], 'feature a': np.ones(6)}) - mock_transb.return_value = pd.DataFrame({'feature 
b': np.ones(6) * 2}) - mock_transc.return_value = pd.DataFrame({'feature c': np.ones(6) * 3}) +@patch(f"{__name__}.EstimatorC.predict") +@patch(f"{__name__}.EstimatorB.predict") +@patch(f"{__name__}.EstimatorA.predict") +@patch(f"{__name__}.TransformerC.transform") +@patch(f"{__name__}.TransformerB.transform") +@patch(f"{__name__}.TransformerA.transform") +def test_component_graph_evaluation_plumbing( + mock_transa, + mock_transb, + mock_transc, + mock_preda, + mock_predb, + mock_predc, + dummy_components, +): + ( + TransformerA, + TransformerB, + TransformerC, + EstimatorA, + EstimatorB, + EstimatorC, + ) = dummy_components + mock_transa.return_value = pd.DataFrame( + {"feature trans": [1, 0, 0, 0, 0, 0], "feature a": np.ones(6)} + ) + mock_transb.return_value = pd.DataFrame({"feature b": np.ones(6) * 2}) + mock_transc.return_value = pd.DataFrame({"feature c": np.ones(6) * 3}) mock_preda.return_value = pd.Series([0, 0, 0, 1, 0, 0]) mock_predb.return_value = pd.Series([0, 0, 0, 0, 1, 0]) mock_predc.return_value = pd.Series([0, 0, 0, 0, 0, 1]) graph = { - 'transformer a': [TransformerA], - 'transformer b': [TransformerB, 'transformer a'], - 'transformer c': [TransformerC, 'transformer a', 'transformer b'], - 'estimator a': [EstimatorA], - 'estimator b': [EstimatorB, 'transformer a'], - 'estimator c': [EstimatorC, 'transformer a', 'estimator a', 'transformer b', 'estimator b', 'transformer c'] + "transformer a": [TransformerA], + "transformer b": [TransformerB, "transformer a"], + "transformer c": [TransformerC, "transformer a", "transformer b"], + "estimator a": [EstimatorA], + "estimator b": [EstimatorB, "transformer a"], + "estimator c": [ + EstimatorC, + "transformer a", + "estimator a", + "transformer b", + "estimator b", + "transformer c", + ], } component_graph = ComponentGraph(graph) component_graph.instantiate({}) - X = pd.DataFrame({'feature1': np.zeros(6), 'feature2': np.zeros(6)}) + X = pd.DataFrame({"feature1": np.zeros(6), "feature2": np.zeros(6)}) y = pd.Series(np.zeros(6)) component_graph.fit(X, y) predict_out = component_graph.predict(X) assert_frame_equal(mock_transa.call_args[0][0], X) - assert_frame_equal(mock_transb.call_args[0][0], pd.DataFrame({'feature trans': pd.Series([1, 0, 0, 0, 0, 0], dtype="int64"), - 'feature a': np.ones(6)}, columns=['feature trans', 'feature a'])) - assert_frame_equal(mock_transc.call_args[0][0], pd.DataFrame({'feature trans': pd.Series([1, 0, 0, 0, 0, 0], dtype="int64"), - 'feature a': np.ones(6), - 'feature b': np.ones(6) * 2}, - columns=['feature trans', 'feature a', 'feature b'])) + assert_frame_equal( + mock_transb.call_args[0][0], + pd.DataFrame( + { + "feature trans": pd.Series([1, 0, 0, 0, 0, 0], dtype="int64"), + "feature a": np.ones(6), + }, + columns=["feature trans", "feature a"], + ), + ) + assert_frame_equal( + mock_transc.call_args[0][0], + pd.DataFrame( + { + "feature trans": pd.Series([1, 0, 0, 0, 0, 0], dtype="int64"), + "feature a": np.ones(6), + "feature b": np.ones(6) * 2, + }, + columns=["feature trans", "feature a", "feature b"], + ), + ) assert_frame_equal(mock_preda.call_args[0][0], X) - assert_frame_equal(mock_predb.call_args[0][0], pd.DataFrame({'feature trans': pd.Series([1, 0, 0, 0, 0, 0], dtype="int64"), - 'feature a': np.ones(6)}, - columns=['feature trans', 'feature a'])) - assert_frame_equal(mock_predc.call_args[0][0], pd.DataFrame({'feature trans': pd.Series([1, 0, 0, 0, 0, 0], dtype="int64"), - 'feature a': np.ones(6), - 'estimator a': pd.Series([0, 0, 0, 1, 0, 0], dtype="int64"), - 'feature b': np.ones(6) * 
2, - 'estimator b': pd.Series([0, 0, 0, 0, 1, 0], dtype="int64"), - 'feature c': np.ones(6) * 3}, - columns=['feature trans', 'feature a', 'estimator a', 'feature b', 'estimator b', 'feature c'])) + assert_frame_equal( + mock_predb.call_args[0][0], + pd.DataFrame( + { + "feature trans": pd.Series([1, 0, 0, 0, 0, 0], dtype="int64"), + "feature a": np.ones(6), + }, + columns=["feature trans", "feature a"], + ), + ) + assert_frame_equal( + mock_predc.call_args[0][0], + pd.DataFrame( + { + "feature trans": pd.Series([1, 0, 0, 0, 0, 0], dtype="int64"), + "feature a": np.ones(6), + "estimator a": pd.Series([0, 0, 0, 1, 0, 0], dtype="int64"), + "feature b": np.ones(6) * 2, + "estimator b": pd.Series([0, 0, 0, 0, 1, 0], dtype="int64"), + "feature c": np.ones(6) * 3, + }, + columns=[ + "feature trans", + "feature a", + "estimator a", + "feature b", + "estimator b", + "feature c", + ], + ), + ) assert_series_equal(pd.Series([0, 0, 0, 0, 0, 1], dtype="int64"), predict_out) def test_input_feature_names(example_graph): - X = pd.DataFrame({'column_1': ['a', 'b', 'c', 'd', 'a', 'a', 'b', 'c', 'b'], - 'column_2': [1, 2, 3, 4, 5, 6, 5, 4, 3]}) + X = pd.DataFrame( + { + "column_1": ["a", "b", "c", "d", "a", "a", "b", "c", "b"], + "column_2": [1, 2, 3, 4, 5, 6, 5, 4, 3], + } + ) y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0]) component_graph = ComponentGraph(example_graph) - component_graph.instantiate({'OneHot_RandomForest': {'top_n': 2}, - 'OneHot_ElasticNet': {'top_n': 3}}) + component_graph.instantiate( + {"OneHot_RandomForest": {"top_n": 2}, "OneHot_ElasticNet": {"top_n": 3}} + ) assert component_graph.input_feature_names == {} component_graph.fit(X, y) input_feature_names = component_graph.input_feature_names - assert input_feature_names['Imputer'] == ['column_1', 'column_2'] - assert input_feature_names['OneHot_RandomForest'] == ['column_1', 'column_2'] - assert input_feature_names['OneHot_ElasticNet'] == ['column_1', 'column_2'] - assert input_feature_names['Random Forest'] == ['column_2', 'column_1_a', 'column_1_b'] - assert input_feature_names['Elastic Net'] == ['column_2', 'column_1_a', 'column_1_b', 'column_1_c'] - assert input_feature_names['Logistic Regression'] == ['Random Forest', 'Elastic Net'] + assert input_feature_names["Imputer"] == ["column_1", "column_2"] + assert input_feature_names["OneHot_RandomForest"] == ["column_1", "column_2"] + assert input_feature_names["OneHot_ElasticNet"] == ["column_1", "column_2"] + assert input_feature_names["Random Forest"] == [ + "column_2", + "column_1_a", + "column_1_b", + ] + assert input_feature_names["Elastic Net"] == [ + "column_2", + "column_1_a", + "column_1_b", + "column_1_c", + ] + assert input_feature_names["Logistic Regression"] == [ + "Random Forest", + "Elastic Net", + ] def test_iteration(example_graph): component_graph = ComponentGraph(example_graph) - expected = [Imputer, OneHotEncoder, ElasticNetClassifier, OneHotEncoder, RandomForestClassifier, LogisticRegressionClassifier] + expected = [ + Imputer, + OneHotEncoder, + ElasticNetClassifier, + OneHotEncoder, + RandomForestClassifier, + LogisticRegressionClassifier, + ] iteration = [component for component in component_graph] assert iteration == expected - component_graph.instantiate({'OneHot_RandomForest': {'top_n': 32}}) - expected = [Imputer(), OneHotEncoder(), ElasticNetClassifier(), OneHotEncoder(top_n=32), RandomForestClassifier(), LogisticRegressionClassifier()] + component_graph.instantiate({"OneHot_RandomForest": {"top_n": 32}}) + expected = [ + Imputer(), + OneHotEncoder(), + 
ElasticNetClassifier(), + OneHotEncoder(top_n=32), + RandomForestClassifier(), + LogisticRegressionClassifier(), + ] iteration = [component for component in component_graph] assert iteration == expected def test_custom_input_feature_types(example_graph): - X = pd.DataFrame({'column_1': ['a', 'b', 'c', 'd', 'a', 'a', 'b', 'c', 'b'], - 'column_2': [1, 2, 3, 4, 5, 6, 5, 4, 3]}) + X = pd.DataFrame( + { + "column_1": ["a", "b", "c", "d", "a", "a", "b", "c", "b"], + "column_2": [1, 2, 3, 4, 5, 6, 5, 4, 3], + } + ) y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0]) X = infer_feature_types(X, {"column_2": "categorical"}) component_graph = ComponentGraph(example_graph) - component_graph.instantiate({'OneHot_RandomForest': {'top_n': 2}, - 'OneHot_ElasticNet': {'top_n': 3}}) + component_graph.instantiate( + {"OneHot_RandomForest": {"top_n": 2}, "OneHot_ElasticNet": {"top_n": 3}} + ) assert component_graph.input_feature_names == {} component_graph.fit(X, y) input_feature_names = component_graph.input_feature_names - assert input_feature_names['Imputer'] == ['column_1', 'column_2'] - assert input_feature_names['OneHot_RandomForest'] == ['column_1', 'column_2'] - assert input_feature_names['OneHot_ElasticNet'] == ['column_1', 'column_2'] - assert input_feature_names['Random Forest'] == ['column_1_a', 'column_1_b', 'column_2_4', 'column_2_5'] - assert input_feature_names['Elastic Net'] == ['column_1_a', 'column_1_b', 'column_1_c', 'column_2_3', 'column_2_4', 'column_2_5'] - assert input_feature_names['Logistic Regression'] == ['Random Forest', 'Elastic Net'] + assert input_feature_names["Imputer"] == ["column_1", "column_2"] + assert input_feature_names["OneHot_RandomForest"] == ["column_1", "column_2"] + assert input_feature_names["OneHot_ElasticNet"] == ["column_1", "column_2"] + assert input_feature_names["Random Forest"] == [ + "column_1_a", + "column_1_b", + "column_2_4", + "column_2_5", + ] + assert input_feature_names["Elastic Net"] == [ + "column_1_a", + "column_1_b", + "column_1_c", + "column_2_3", + "column_2_4", + "column_2_5", + ] + assert input_feature_names["Logistic Regression"] == [ + "Random Forest", + "Elastic Net", + ] def test_component_graph_dataset_with_different_types(): # Checks that types are converted correctly by Woodwork. Specifically, the standard scaler # should convert column_3 to float, so our code to try to convert back to the original boolean type # will catch the TypeError thrown and not convert the column. 
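The comment above describes the behaviour this test pins down: after the Standard Scaler turns the boolean column_3 into floats, evalml tries to restore the original type and simply keeps the float column when that cast raises a TypeError. A minimal sketch of that pattern follows; the helper name, signature, and exception handling are illustrative assumptions, not evalml's implementation:

    import pandas as pd

    def restore_original_dtypes(transformed: pd.DataFrame, original_dtypes: dict) -> pd.DataFrame:
        """Best-effort restore of pre-transform dtypes; keep the transformed dtype on failure."""
        restored = transformed.copy()
        for column, dtype in original_dtypes.items():
            if column not in restored.columns:
                continue
            try:
                # e.g. casting a standard-scaled float column back to its original boolean type
                restored[column] = restored[column].astype(dtype)
            except TypeError:
                # Scaled values are no longer boolean-safe, so the cast fails and the column stays float.
                pass
        return restored
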
- graph = {'Imputer': [Imputer], - 'OneHot': [OneHotEncoder, 'Imputer.x'], - 'DateTime': [DateTimeFeaturizer, 'OneHot.x'], - 'Scaler': [StandardScaler, 'DateTime.x'], - 'Random Forest': [RandomForestClassifier, 'Scaler.x'], - 'Elastic Net': [ElasticNetClassifier, 'Scaler.x'], - 'Logistic Regression': [LogisticRegressionClassifier, 'Random Forest', 'Elastic Net']} - - X = pd.DataFrame({'column_1': ['a', 'b', 'c', 'd', 'a', 'a', 'b', 'c', 'b'], - 'column_2': [1, 2, 3, 4, 5, 6, 5, 4, 3], - 'column_3': [True, False, True, False, True, False, True, False, False]}) + graph = { + "Imputer": [Imputer], + "OneHot": [OneHotEncoder, "Imputer.x"], + "DateTime": [DateTimeFeaturizer, "OneHot.x"], + "Scaler": [StandardScaler, "DateTime.x"], + "Random Forest": [RandomForestClassifier, "Scaler.x"], + "Elastic Net": [ElasticNetClassifier, "Scaler.x"], + "Logistic Regression": [ + LogisticRegressionClassifier, + "Random Forest", + "Elastic Net", + ], + } + + X = pd.DataFrame( + { + "column_1": ["a", "b", "c", "d", "a", "a", "b", "c", "b"], + "column_2": [1, 2, 3, 4, 5, 6, 5, 4, 3], + "column_3": [True, False, True, False, True, False, True, False, False], + } + ) y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0]) X = infer_feature_types(X, {"column_2": "categorical"}) @@ -726,112 +1029,222 @@ def test_component_graph_dataset_with_different_types(): component_graph.fit(X, y) input_feature_names = component_graph.input_feature_names - assert input_feature_names['Imputer'] == ['column_1', 'column_2', 'column_3'] - assert input_feature_names['OneHot'] == ['column_1', 'column_2', 'column_3'] - assert input_feature_names['DateTime'] == ['column_3', 'column_1_a', 'column_1_b', 'column_1_c', 'column_1_d', - 'column_2_1', 'column_2_2', 'column_2_3', 'column_2_4', 'column_2_5', 'column_2_6'] - assert input_feature_names['Scaler'] == ['column_3', 'column_1_a', 'column_1_b', 'column_1_c', 'column_1_d', - 'column_2_1', 'column_2_2', 'column_2_3', 'column_2_4', 'column_2_5', 'column_2_6'] - assert input_feature_names['Random Forest'] == ['column_3', 'column_1_a', 'column_1_b', 'column_1_c', 'column_1_d', - 'column_2_1', 'column_2_2', 'column_2_3', 'column_2_4', 'column_2_5', 'column_2_6'] - assert input_feature_names['Elastic Net'] == ['column_3', 'column_1_a', 'column_1_b', 'column_1_c', 'column_1_d', - 'column_2_1', 'column_2_2', 'column_2_3', 'column_2_4', 'column_2_5', 'column_2_6'] - assert input_feature_names['Logistic Regression'] == ['Random Forest', 'Elastic Net'] + assert input_feature_names["Imputer"] == ["column_1", "column_2", "column_3"] + assert input_feature_names["OneHot"] == ["column_1", "column_2", "column_3"] + assert input_feature_names["DateTime"] == [ + "column_3", + "column_1_a", + "column_1_b", + "column_1_c", + "column_1_d", + "column_2_1", + "column_2_2", + "column_2_3", + "column_2_4", + "column_2_5", + "column_2_6", + ] + assert input_feature_names["Scaler"] == [ + "column_3", + "column_1_a", + "column_1_b", + "column_1_c", + "column_1_d", + "column_2_1", + "column_2_2", + "column_2_3", + "column_2_4", + "column_2_5", + "column_2_6", + ] + assert input_feature_names["Random Forest"] == [ + "column_3", + "column_1_a", + "column_1_b", + "column_1_c", + "column_1_d", + "column_2_1", + "column_2_2", + "column_2_3", + "column_2_4", + "column_2_5", + "column_2_6", + ] + assert input_feature_names["Elastic Net"] == [ + "column_3", + "column_1_a", + "column_1_b", + "column_1_c", + "column_1_d", + "column_2_1", + "column_2_2", + "column_2_3", + "column_2_4", + "column_2_5", + "column_2_6", + ] + assert 
input_feature_names["Logistic Regression"] == [ + "Random Forest", + "Elastic Net", + ] def test_component_graph_sampler(): - graph = {'Imputer': [Imputer], - 'OneHot': [OneHotEncoder, 'Imputer.x'], - 'Undersampler': [Undersampler, 'OneHot.x'], - 'Random Forest': [RandomForestClassifier, 'Undersampler.x', 'Undersampler.y'], - 'Elastic Net': [ElasticNetClassifier, 'Undersampler.x', 'Undersampler.y'], - 'Logistic Regression': [LogisticRegressionClassifier, 'Random Forest', 'Elastic Net']} + graph = { + "Imputer": [Imputer], + "OneHot": [OneHotEncoder, "Imputer.x"], + "Undersampler": [Undersampler, "OneHot.x"], + "Random Forest": [RandomForestClassifier, "Undersampler.x", "Undersampler.y"], + "Elastic Net": [ElasticNetClassifier, "Undersampler.x", "Undersampler.y"], + "Logistic Regression": [ + LogisticRegressionClassifier, + "Random Forest", + "Elastic Net", + ], + } component_graph = ComponentGraph(graph) component_graph.instantiate({}) - assert component_graph.get_parents('Imputer') == [] - assert component_graph.get_parents('OneHot') == ['Imputer.x'] - assert component_graph.get_parents('Undersampler') == ['OneHot.x'] - assert component_graph.get_parents('Random Forest') == ['Undersampler.x', 'Undersampler.y'] - assert component_graph.get_parents('Elastic Net') == ['Undersampler.x', 'Undersampler.y'] - assert component_graph.get_parents('Logistic Regression') == ['Random Forest', 'Elastic Net'] + assert component_graph.get_parents("Imputer") == [] + assert component_graph.get_parents("OneHot") == ["Imputer.x"] + assert component_graph.get_parents("Undersampler") == ["OneHot.x"] + assert component_graph.get_parents("Random Forest") == [ + "Undersampler.x", + "Undersampler.y", + ] + assert component_graph.get_parents("Elastic Net") == [ + "Undersampler.x", + "Undersampler.y", + ] + assert component_graph.get_parents("Logistic Regression") == [ + "Random Forest", + "Elastic Net", + ] def test_component_graph_sampler_list(): - component_list = ['Imputer', 'One Hot Encoder', 'Undersampler', 'Random Forest Classifier'] + component_list = [ + "Imputer", + "One Hot Encoder", + "Undersampler", + "Random Forest Classifier", + ] component_graph = ComponentGraph.from_list(component_list) assert len(component_graph.component_dict) == 4 - assert component_graph.get_component('Imputer') == Imputer - assert component_graph.get_component('One Hot Encoder') == OneHotEncoder - assert component_graph.get_component('Undersampler') == Undersampler - assert component_graph.get_component('Random Forest Classifier') == RandomForestClassifier + assert component_graph.get_component("Imputer") == Imputer + assert component_graph.get_component("One Hot Encoder") == OneHotEncoder + assert component_graph.get_component("Undersampler") == Undersampler + assert ( + component_graph.get_component("Random Forest Classifier") + == RandomForestClassifier + ) assert component_graph.compute_order == component_list assert component_graph.component_dict == { - 'Imputer': [Imputer], - 'One Hot Encoder': [OneHotEncoder, 'Imputer.x'], - 'Undersampler': [Undersampler, 'One Hot Encoder.x'], - 'Random Forest Classifier': [RandomForestClassifier, 'Undersampler.x', 'Undersampler.y'] + "Imputer": [Imputer], + "One Hot Encoder": [OneHotEncoder, "Imputer.x"], + "Undersampler": [Undersampler, "One Hot Encoder.x"], + "Random Forest Classifier": [ + RandomForestClassifier, + "Undersampler.x", + "Undersampler.y", + ], } - assert component_graph.get_parents('Imputer') == [] - assert component_graph.get_parents('One Hot Encoder') == ['Imputer.x'] 
- assert component_graph.get_parents('Undersampler') == ['One Hot Encoder.x'] - assert component_graph.get_parents('Random Forest Classifier') == ['Undersampler.x', 'Undersampler.y'] + assert component_graph.get_parents("Imputer") == [] + assert component_graph.get_parents("One Hot Encoder") == ["Imputer.x"] + assert component_graph.get_parents("Undersampler") == ["One Hot Encoder.x"] + assert component_graph.get_parents("Random Forest Classifier") == [ + "Undersampler.x", + "Undersampler.y", + ] def test_component_graph_dataset_with_target_imputer(): - X = pd.DataFrame({'column_1': ['a', 'b', 'c', 'd', 'a', 'a', 'b', 'c', 'b'], - 'column_2': [1, 2, 3, 4, 5, 6, 5, 4, 3]}) + X = pd.DataFrame( + { + "column_1": ["a", "b", "c", "d", "a", "a", "b", "c", "b"], + "column_2": [1, 2, 3, 4, 5, 6, 5, 4, 3], + } + ) y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, np.nan]) - graph = {'Target Imputer': [TargetImputer], - 'OneHot': [OneHotEncoder, 'Target Imputer.x', 'Target Imputer.y'], - 'Random Forest': [RandomForestClassifier, 'OneHot.x', 'Target Imputer.y'], - 'Elastic Net': [ElasticNetClassifier, 'OneHot.x', 'Target Imputer.y'], - 'Logistic Regression': [LogisticRegressionClassifier, 'Random Forest', 'Elastic Net', 'Target Imputer.y']} + graph = { + "Target Imputer": [TargetImputer], + "OneHot": [OneHotEncoder, "Target Imputer.x", "Target Imputer.y"], + "Random Forest": [RandomForestClassifier, "OneHot.x", "Target Imputer.y"], + "Elastic Net": [ElasticNetClassifier, "OneHot.x", "Target Imputer.y"], + "Logistic Regression": [ + LogisticRegressionClassifier, + "Random Forest", + "Elastic Net", + "Target Imputer.y", + ], + } component_graph = ComponentGraph(graph) component_graph.instantiate({}) - assert component_graph.get_parents('Target Imputer') == [] - assert component_graph.get_parents('OneHot') == ['Target Imputer.x', 'Target Imputer.y'] - assert component_graph.get_parents('Random Forest') == ['OneHot.x', 'Target Imputer.y'] - assert component_graph.get_parents('Elastic Net') == ['OneHot.x', 'Target Imputer.y'] + assert component_graph.get_parents("Target Imputer") == [] + assert component_graph.get_parents("OneHot") == [ + "Target Imputer.x", + "Target Imputer.y", + ] + assert component_graph.get_parents("Random Forest") == [ + "OneHot.x", + "Target Imputer.y", + ] + assert component_graph.get_parents("Elastic Net") == [ + "OneHot.x", + "Target Imputer.y", + ] component_graph.fit(X, y) predictions = component_graph.predict(X) assert not pd.isnull(predictions).any() -@patch('evalml.pipelines.components.estimators.LogisticRegressionClassifier.fit') +@patch("evalml.pipelines.components.estimators.LogisticRegressionClassifier.fit") def test_component_graph_sampler_y_passes(mock_estimator_fit): - pytest.importorskip("imblearn.over_sampling", reason="Cannot import imblearn, skipping tests") + pytest.importorskip( + "imblearn.over_sampling", reason="Cannot import imblearn, skipping tests" + ) # makes sure the y value from oversampler gets passed to the estimator, even though StandardScaler has no y output - X = pd.DataFrame({"a": [i for i in range(100)], - "b": [i % 3 for i in range(100)]}) + X = pd.DataFrame({"a": [i for i in range(100)], "b": [i % 3 for i in range(100)]}) y = pd.Series([0] * 90 + [1] * 10) - component_list = ['Imputer', 'SMOTE Oversampler', 'Standard Scaler', 'Logistic Regression Classifier'] + component_list = [ + "Imputer", + "SMOTE Oversampler", + "Standard Scaler", + "Logistic Regression Classifier", + ] component_graph = ComponentGraph.from_list(component_list) 
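For the length assertion that follows, the arithmetic works out as sketched below; the 0.25 minority-to-majority target ratio is inferred from the test's own int(1.25 * 90) expectation and should be read as an assumption about the oversampler's configuration rather than a documented default. The point of the test is that the resampled y is carried past the Standard Scaler (which only transforms X), so X and y reach the estimator with matching lengths:

    # Before resampling: 90 majority rows, 10 minority rows.
    majority = 90
    target_ratio = 0.25                                # assumed minority/majority ratio after oversampling
    resampled_minority = int(majority * target_ratio)  # minority class grown to 22 rows
    total_rows = majority + resampled_minority         # 112 rows of both X and y reach the estimator
    assert total_rows == int(1.25 * majority)
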
component_graph.instantiate({}) component_graph.fit(X, y) - assert len(mock_estimator_fit.call_args[0][0]) == len(mock_estimator_fit.call_args[0][1]) + assert len(mock_estimator_fit.call_args[0][0]) == len( + mock_estimator_fit.call_args[0][1] + ) assert len(mock_estimator_fit.call_args[0][0]) == int(1.25 * 90) -@patch('evalml.pipelines.components.estimators.RandomForestClassifier.fit') -@patch('evalml.pipelines.components.estimators.DecisionTreeClassifier.fit') +@patch("evalml.pipelines.components.estimators.RandomForestClassifier.fit") +@patch("evalml.pipelines.components.estimators.DecisionTreeClassifier.fit") def test_component_graph_sampler_same_given_components(mock_dt_fit, mock_rf_fit): - pytest.importorskip("imblearn.over_sampling", reason="Cannot import imblearn, skipping tests") - X = pd.DataFrame({"a": [i for i in range(100)], - "b": [i % 3 for i in range(100)]}) + pytest.importorskip( + "imblearn.over_sampling", reason="Cannot import imblearn, skipping tests" + ) + X = pd.DataFrame({"a": [i for i in range(100)], "b": [i % 3 for i in range(100)]}) y = pd.Series([0] * 90 + [1] * 10) - component_list = ['Imputer', 'SMOTE Oversampler', 'Random Forest Classifier'] + component_list = ["Imputer", "SMOTE Oversampler", "Random Forest Classifier"] component_graph = ComponentGraph.from_list(component_list) component_graph.instantiate({}) component_graph.fit(X, y) - component_list2 = ['Imputer', 'SMOTE Oversampler', 'Decision Tree Classifier'] + component_list2 = ["Imputer", "SMOTE Oversampler", "Decision Tree Classifier"] component_graph2 = ComponentGraph.from_list(component_list2) component_graph2.instantiate({}) component_graph2.fit(X, y) - pd.testing.assert_frame_equal(mock_dt_fit.call_args[0][0], mock_rf_fit.call_args[0][0]) - pd.testing.assert_series_equal(mock_dt_fit.call_args[0][1], mock_rf_fit.call_args[0][1]) + pd.testing.assert_frame_equal( + mock_dt_fit.call_args[0][0], mock_rf_fit.call_args[0][0] + ) + pd.testing.assert_series_equal( + mock_dt_fit.call_args[0][1], mock_rf_fit.call_args[0][1] + ) diff --git a/evalml/tests/pipeline_tests/test_graphs.py b/evalml/tests/pipeline_tests/test_graphs.py index e38a810bc2..6698db6ec8 100644 --- a/evalml/tests/pipeline_tests/test_graphs.py +++ b/evalml/tests/pipeline_tests/test_graphs.py @@ -11,10 +11,17 @@ @pytest.fixture def test_pipeline(): class TestPipeline(BinaryClassificationPipeline): - component_graph = ['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier'] + component_graph = [ + "Simple Imputer", + "One Hot Encoder", + "Standard Scaler", + "Logistic Regression Classifier", + ] def __init__(self, parameters, random_seed=0): - super().__init__(component_graph=self.component_graph, parameters=parameters) + super().__init__( + component_graph=self.component_graph, parameters=parameters + ) @property def feature_importance(self): @@ -29,36 +36,48 @@ def feature_importance(self): @pytest.fixture def test_component_graph(): - graph = {'Imputer': ['Imputer'], - 'OneHot_RandomForest': ['One Hot Encoder', 'Imputer.x'], - 'OneHot_ElasticNet': ['One Hot Encoder', 'Imputer.x'], - 'Random Forest': ['Random Forest Classifier', 'OneHot_RandomForest.x'], - 'Elastic Net': ['Elastic Net Classifier', 'OneHot_ElasticNet.x'], - 'Logistic Regression': ['Logistic Regression Classifier', 'Random Forest', 'Elastic Net']} + graph = { + "Imputer": ["Imputer"], + "OneHot_RandomForest": ["One Hot Encoder", "Imputer.x"], + "OneHot_ElasticNet": ["One Hot Encoder", "Imputer.x"], + "Random Forest": ["Random Forest 
Classifier", "OneHot_RandomForest.x"], + "Elastic Net": ["Elastic Net Classifier", "OneHot_ElasticNet.x"], + "Logistic Regression": [ + "Logistic Regression Classifier", + "Random Forest", + "Elastic Net", + ], + } component_graph = ComponentGraph(graph) return component_graph def test_backend(test_pipeline): - graphviz = pytest.importorskip('graphviz', reason='Skipping plotting test because graphviz not installed') - with patch('graphviz.Digraph.pipe') as mock_func: - mock_func.side_effect = graphviz.backend.ExecutableNotFound('Not Found') + graphviz = pytest.importorskip( + "graphviz", reason="Skipping plotting test because graphviz not installed" + ) + with patch("graphviz.Digraph.pipe") as mock_func: + mock_func.side_effect = graphviz.backend.ExecutableNotFound("Not Found") clf = test_pipeline with pytest.raises(RuntimeError): clf.graph() def test_returns_digraph_object(test_pipeline): - graphviz = pytest.importorskip('graphviz', reason='Skipping plotting test because graphviz not installed') + graphviz = pytest.importorskip( + "graphviz", reason="Skipping plotting test because graphviz not installed" + ) clf = test_pipeline graph = clf.graph() assert isinstance(graph, graphviz.Digraph) def test_backend_comp_graph(test_component_graph): - graphviz = pytest.importorskip('graphviz', reason='Skipping plotting test because graphviz not installed') - with patch('graphviz.Digraph.pipe') as mock_func: - mock_func.side_effect = graphviz.backend.ExecutableNotFound('Not Found') + graphviz = pytest.importorskip( + "graphviz", reason="Skipping plotting test because graphviz not installed" + ) + with patch("graphviz.Digraph.pipe") as mock_func: + mock_func.side_effect = graphviz.backend.ExecutableNotFound("Not Found") comp = test_component_graph with pytest.raises(RuntimeError): comp.graph() @@ -67,53 +86,67 @@ def test_backend_comp_graph(test_component_graph): def test_saving_png_file(tmpdir, test_pipeline, is_using_conda): if is_using_conda: pytest.skip("Skipping saving_png_file if running during conda build process.") - pytest.importorskip('graphviz', reason='Skipping plotting test because graphviz not installed') - filepath = os.path.join(str(tmpdir), 'pipeline.png') + pytest.importorskip( + "graphviz", reason="Skipping plotting test because graphviz not installed" + ) + filepath = os.path.join(str(tmpdir), "pipeline.png") pipeline = test_pipeline pipeline.graph(filepath=filepath) assert os.path.isfile(filepath) def test_returns_digraph_object_comp_graph(test_component_graph): - graphviz = pytest.importorskip('graphviz', reason='Skipping plotting test because graphviz not installed') + graphviz = pytest.importorskip( + "graphviz", reason="Skipping plotting test because graphviz not installed" + ) comp = test_component_graph - graph = comp.graph('test', 'png') + graph = comp.graph("test", "png") assert isinstance(graph, graphviz.Digraph) def test_returns_digraph_object_comp_graph_with_params(test_component_graph): - graphviz = pytest.importorskip('graphviz', reason='Skipping plotting test because graphviz not installed') + graphviz = pytest.importorskip( + "graphviz", reason="Skipping plotting test because graphviz not installed" + ) comp = test_component_graph - parameters = {'OneHot_RandomForest': {'top_n': 3}, - 'OneHot_ElasticNet': {'top_n': 5}, - 'Elastic Net': {'max_iter': 100}} + parameters = { + "OneHot_RandomForest": {"top_n": 3}, + "OneHot_ElasticNet": {"top_n": 5}, + "Elastic Net": {"max_iter": 100}, + } comp.instantiate(parameters) - graph = comp.graph('test', 'png') + graph = 
comp.graph("test", "png") assert isinstance(graph, graphviz.Digraph) - assert 'top_n : 3' in graph.source - assert 'top_n : 5' in graph.source - assert 'max_iter : 100' in graph.source + assert "top_n : 3" in graph.source + assert "top_n : 5" in graph.source + assert "max_iter : 100" in graph.source def test_missing_file_extension(tmpdir, test_pipeline): - pytest.importorskip('graphviz', reason='Skipping plotting test because graphviz not installed') - filepath = os.path.join(str(tmpdir), 'test1') + pytest.importorskip( + "graphviz", reason="Skipping plotting test because graphviz not installed" + ) + filepath = os.path.join(str(tmpdir), "test1") pipeline = test_pipeline with pytest.raises(ValueError, match="Unknown format"): pipeline.graph(filepath=filepath) def test_invalid_format(tmpdir, test_pipeline): - pytest.importorskip('graphviz', reason='Skipping plotting test because graphviz not installed') - filepath = os.path.join(str(tmpdir), 'test1.xyz') + pytest.importorskip( + "graphviz", reason="Skipping plotting test because graphviz not installed" + ) + filepath = os.path.join(str(tmpdir), "test1.xyz") pipeline = test_pipeline with pytest.raises(ValueError, match="Unknown format"): pipeline.graph(filepath=filepath) def test_invalid_path(tmpdir, test_pipeline): - pytest.importorskip('graphviz', reason='Skipping plotting test because graphviz not installed') - filepath = os.path.join(str(tmpdir), 'invalid', 'path', 'pipeline.png') + pytest.importorskip( + "graphviz", reason="Skipping plotting test because graphviz not installed" + ) + filepath = os.path.join(str(tmpdir), "invalid", "path", "pipeline.png") assert not os.path.exists(filepath) pipeline = test_pipeline with pytest.raises(ValueError, match="Specified filepath is not writeable"): @@ -122,7 +155,10 @@ def test_invalid_path(tmpdir, test_pipeline): def test_graph_feature_importance(X_y_binary, test_pipeline): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y = X_y_binary clf = test_pipeline clf.fit(X, y) @@ -130,7 +166,10 @@ def test_graph_feature_importance(X_y_binary, test_pipeline): def test_graph_feature_importance_show_all_features(X_y_binary, test_pipeline): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y = X_y_binary clf = test_pipeline clf.fit(X, y) @@ -139,28 +178,37 @@ def test_graph_feature_importance_show_all_features(X_y_binary, test_pipeline): assert isinstance(figure, go.Figure) data = figure.data[0] - assert (np.any(data['x'] == 0.0)) + assert np.any(data["x"] == 0.0) def test_graph_feature_importance_threshold(X_y_binary, test_pipeline): - go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + go = pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y = X_y_binary clf = test_pipeline clf.fit(X, y) - with pytest.raises(ValueError, match="Provided importance threshold of -0.0001 must be greater than or equal to 0"): + with pytest.raises( + ValueError, + match="Provided importance threshold of -0.0001 must be greater than or equal to 0", + ): figure = clf.graph_feature_importance(importance_threshold=-0.0001) figure = 
clf.graph_feature_importance(importance_threshold=0.5) assert isinstance(figure, go.Figure) data = figure.data[0] - assert (np.all(data['x'] >= 0.5)) + assert np.all(data["x"] >= 0.5) -@patch('evalml.pipelines.pipeline_base.jupyter_check') -@patch('evalml.pipelines.pipeline_base.import_or_raise') +@patch("evalml.pipelines.pipeline_base.jupyter_check") +@patch("evalml.pipelines.pipeline_base.import_or_raise") def test_jupyter_graph_check(import_check, jupyter_check, X_y_binary, test_pipeline): - pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed') + pytest.importorskip( + "plotly.graph_objects", + reason="Skipping plotting test because plotly not installed", + ) X, y = X_y_binary clf = test_pipeline clf.fit(X, y) @@ -172,4 +220,4 @@ def test_jupyter_graph_check(import_check, jupyter_check, X_y_binary, test_pipel jupyter_check.return_value = True with pytest.warns(None) as graph_valid: clf.graph_feature_importance() - import_check.assert_called_with('ipywidgets', warning=True) + import_check.assert_called_with("ipywidgets", warning=True) diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index 5d587498fb..b35b658e29 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -1,4 +1,3 @@ - import numpy as np import pandas as pd import pytest @@ -9,7 +8,7 @@ from evalml.pipelines import ( BinaryClassificationPipeline, MulticlassClassificationPipeline, - RegressionPipeline + RegressionPipeline, ) from evalml.pipelines.components import ( DateTimeFeaturizer, @@ -26,14 +25,14 @@ StandardScaler, TargetImputer, TextFeaturizer, - Transformer + Transformer, ) from evalml.pipelines.utils import ( _get_pipeline_base_class, _make_component_list_from_actions, generate_pipeline_code, get_estimators, - make_pipeline + make_pipeline, ) from evalml.problem_types import ProblemTypes, is_time_series @@ -42,10 +41,11 @@ @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) def test_make_pipeline_all_nan_no_categoricals(input_type, problem_type): # testing that all_null column is not considered categorical - X = pd.DataFrame({"all_null": [np.nan, np.nan, np.nan, np.nan, np.nan], - "num": [1, 2, 3, 4, 5]}) + X = pd.DataFrame( + {"all_null": [np.nan, np.nan, np.nan, np.nan, np.nan], "num": [1, 2, 3, 4, 5]} + ) y = pd.Series([0, 0, 1, 1, 0]) - if input_type == 'ww': + if input_type == "ww": X.ww.init() y = ww.init_series(y) @@ -58,29 +58,45 @@ def test_make_pipeline_all_nan_no_categoricals(input_type, problem_type): if problem_type in estimator_class.supported_problem_types: parameters = {} if is_time_series(problem_type): - parameters = {"pipeline": {"date_index": None, "gap": 1, "max_delay": 1}, - "Time Series Baseline Estimator": {"date_index": None, "gap": 1, "max_delay": 1}} + parameters = { + "pipeline": {"date_index": None, "gap": 1, "max_delay": 1}, + "Time Series Baseline Estimator": { + "date_index": None, + "gap": 1, + "max_delay": 1, + }, + } pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) delayed_features = [] - if is_time_series(problem_type) and estimator_class.model_family != ModelFamily.ARIMA: + if ( + is_time_series(problem_type) + and estimator_class.model_family != ModelFamily.ARIMA + ): delayed_features = [DelayedFeatureTransformer] if estimator_class.model_family == ModelFamily.LINEAR_MODEL: estimator_components = 
[StandardScaler, estimator_class] else: estimator_components = [estimator_class] - assert pipeline.component_graph == [DropNullColumns, Imputer] + delayed_features + estimator_components + assert ( + pipeline.component_graph + == [DropNullColumns, Imputer] + delayed_features + estimator_components + ) @pytest.mark.parametrize("input_type", ["pd", "ww"]) @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) def test_make_pipeline(input_type, problem_type): - X = pd.DataFrame({"all_null": [np.nan, np.nan, np.nan, np.nan, np.nan], - "categorical": ["a", "b", "a", "c", "c"], - "some dates": pd.date_range('2000-02-03', periods=5, freq='W')}) + X = pd.DataFrame( + { + "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan], + "categorical": ["a", "b", "a", "c", "c"], + "some dates": pd.date_range("2000-02-03", periods=5, freq="W"), + } + ) y = pd.Series([0, 0, 1, 0, 0]) - if input_type == 'ww': + if input_type == "ww": X.ww.init() y = ww.init_series(y) @@ -93,8 +109,14 @@ def test_make_pipeline(input_type, problem_type): if problem_type in estimator_class.supported_problem_types: parameters = {} if is_time_series(problem_type): - parameters = {"pipeline": {"date_index": "some dates", "gap": 1, "max_delay": 1}, - "Time Series Baseline Estimator": {"date_index": "some dates", "gap": 1, "max_delay": 1}} + parameters = { + "pipeline": {"date_index": "some dates", "gap": 1, "max_delay": 1}, + "Time Series Baseline Estimator": { + "date_index": "some dates", + "gap": 1, + "max_delay": 1, + }, + } pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) @@ -108,19 +130,31 @@ def test_make_pipeline(input_type, problem_type): else: estimator_components = [OneHotEncoder, estimator_class] if estimator_class.model_family == ModelFamily.ARIMA: - assert pipeline.component_graph == [DropNullColumns, Imputer] + estimator_components + assert ( + pipeline.component_graph + == [DropNullColumns, Imputer] + estimator_components + ) else: - assert pipeline.component_graph == [DropNullColumns, Imputer, DateTimeFeaturizer] + delayed_features + estimator_components + assert ( + pipeline.component_graph + == [DropNullColumns, Imputer, DateTimeFeaturizer] + + delayed_features + + estimator_components + ) @pytest.mark.parametrize("input_type", ["pd", "ww"]) @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) def test_make_pipeline_no_nulls(input_type, problem_type): - X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2], - "categorical": ["a", "b", "a", "c", "c"], - "some dates": pd.date_range('2000-02-03', periods=5, freq='W')}) + X = pd.DataFrame( + { + "numerical": [1, 2, 3, 1, 2], + "categorical": ["a", "b", "a", "c", "c"], + "some dates": pd.date_range("2000-02-03", periods=5, freq="W"), + } + ) y = pd.Series([0, 1, 1, 0, 0]) - if input_type == 'ww': + if input_type == "ww": X.ww.init() y = ww.init_series(y) @@ -133,8 +167,14 @@ def test_make_pipeline_no_nulls(input_type, problem_type): if problem_type in estimator_class.supported_problem_types: parameters = {} if is_time_series(problem_type): - parameters = {"pipeline": {"date_index": "some dates", "gap": 1, "max_delay": 1}, - "Time Series Baseline Estimator": {"date_index": "some dates", "gap": 1, "max_delay": 1}} + parameters = { + "pipeline": {"date_index": "some dates", "gap": 1, "max_delay": 1}, + "Time Series Baseline Estimator": { + "date_index": "some dates", + "gap": 1, + "max_delay": 1, + }, + } pipeline = make_pipeline(X, y, estimator_class, problem_type, 
parameters) assert isinstance(pipeline, pipeline_class) @@ -150,17 +190,26 @@ def test_make_pipeline_no_nulls(input_type, problem_type): if estimator_class.model_family == ModelFamily.ARIMA: assert pipeline.component_graph == [Imputer] + estimator_components else: - assert pipeline.component_graph == [Imputer, DateTimeFeaturizer] + delayed_features + estimator_components + assert ( + pipeline.component_graph + == [Imputer, DateTimeFeaturizer] + + delayed_features + + estimator_components + ) @pytest.mark.parametrize("input_type", ["pd", "ww"]) @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) def test_make_pipeline_no_datetimes(input_type, problem_type): - X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2], - "categorical": ["a", "b", "a", "c", "c"], - "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan]}) + X = pd.DataFrame( + { + "numerical": [1, 2, 3, 1, 2], + "categorical": ["a", "b", "a", "c", "c"], + "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan], + } + ) y = pd.Series([0, 1, 1, 0, 0]) - if input_type == 'ww': + if input_type == "ww": X.ww.init() y = ww.init_series(y) @@ -173,8 +222,14 @@ def test_make_pipeline_no_datetimes(input_type, problem_type): if problem_type in estimator_class.supported_problem_types: parameters = {} if is_time_series(problem_type): - parameters = {"pipeline": {"date_index": None, "gap": 1, "max_delay": 1}, - "Time Series Baseline Estimator": {"date_index": None, "gap": 1, "max_delay": 1}} + parameters = { + "pipeline": {"date_index": None, "gap": 1, "max_delay": 1}, + "Time Series Baseline Estimator": { + "date_index": None, + "gap": 1, + "max_delay": 1, + }, + } pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) @@ -188,9 +243,17 @@ def test_make_pipeline_no_datetimes(input_type, problem_type): else: estimator_components = [OneHotEncoder, estimator_class] if estimator_class.model_family == ModelFamily.ARIMA: - assert pipeline.component_graph == [DropNullColumns, Imputer] + estimator_components + assert ( + pipeline.component_graph + == [DropNullColumns, Imputer] + estimator_components + ) else: - assert pipeline.component_graph == [DropNullColumns, Imputer] + delayed_features + estimator_components + assert ( + pipeline.component_graph + == [DropNullColumns, Imputer] + + delayed_features + + estimator_components + ) @pytest.mark.parametrize("input_type", ["pd", "ww"]) @@ -198,7 +261,7 @@ def test_make_pipeline_no_datetimes(input_type, problem_type): def test_make_pipeline_no_column_names(input_type, problem_type): X = pd.DataFrame([[1, "a", np.nan], [2, "b", np.nan], [5, "b", np.nan]]) y = pd.Series([0, 0, 1]) - if input_type == 'ww': + if input_type == "ww": X.ww.init() y = ww.init_series(y) estimators = get_estimators(problem_type=problem_type) @@ -210,8 +273,14 @@ def test_make_pipeline_no_column_names(input_type, problem_type): if problem_type in estimator_class.supported_problem_types: parameters = {} if is_time_series(problem_type): - parameters = {"pipeline": {"date_index": None, "gap": 1, "max_delay": 1}, - "Time Series Baseline Estimator": {"date_index": None, "gap": 1, "max_delay": 1}} + parameters = { + "pipeline": {"date_index": None, "gap": 1, "max_delay": 1}, + "Time Series Baseline Estimator": { + "date_index": None, + "gap": 1, + "max_delay": 1, + }, + } pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) @@ -225,19 +294,37 @@ def test_make_pipeline_no_column_names(input_type, 
problem_type): else: estimator_components = [OneHotEncoder, estimator_class] if estimator_class.model_family == ModelFamily.ARIMA: - assert pipeline.component_graph == [DropNullColumns, Imputer] + estimator_components + assert ( + pipeline.component_graph + == [DropNullColumns, Imputer] + estimator_components + ) else: - assert pipeline.component_graph == [DropNullColumns, Imputer] + delayed_features + estimator_components + assert ( + pipeline.component_graph + == [DropNullColumns, Imputer] + + delayed_features + + estimator_components + ) @pytest.mark.parametrize("input_type", ["pd", "ww"]) @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) def test_make_pipeline_text_columns(input_type, problem_type): - X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2], - "categorical": ["a", "b", "a", "c", "c"], - "text": ["string one", "another", "text for a column, this should be a text column!!", "text string", "hello world"]}) + X = pd.DataFrame( + { + "numerical": [1, 2, 3, 1, 2], + "categorical": ["a", "b", "a", "c", "c"], + "text": [ + "string one", + "another", + "text for a column, this should be a text column!!", + "text string", + "hello world", + ], + } + ) y = pd.Series([0, 0, 1, 1, 0]) - if input_type == 'ww': + if input_type == "ww": X.ww.init() y = ww.init_series(y) estimators = get_estimators(problem_type=problem_type) @@ -250,8 +337,14 @@ def test_make_pipeline_text_columns(input_type, problem_type): if problem_type in estimator_class.supported_problem_types: parameters = {} if is_time_series(problem_type): - parameters = {"pipeline": {"date_index": None, "gap": 1, "max_delay": 1}, - "Time Series Baseline Estimator": {"date_index": None, "gap": 1, "max_delay": 1}} + parameters = { + "pipeline": {"date_index": None, "gap": 1, "max_delay": 1}, + "Time Series Baseline Estimator": { + "date_index": None, + "gap": 1, + "max_delay": 1, + }, + } pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) @@ -265,18 +358,42 @@ def test_make_pipeline_text_columns(input_type, problem_type): else: estimator_components = [OneHotEncoder, estimator_class] if estimator_class.model_family == ModelFamily.ARIMA: - assert pipeline.component_graph == [Imputer, TextFeaturizer] + estimator_components + assert ( + pipeline.component_graph + == [Imputer, TextFeaturizer] + estimator_components + ) else: - assert pipeline.component_graph == [Imputer, TextFeaturizer] + delayed_features + estimator_components + assert ( + pipeline.component_graph + == [Imputer, TextFeaturizer] + + delayed_features + + estimator_components + ) @pytest.mark.parametrize("input_type", ["pd", "ww"]) @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) def test_make_pipeline_only_text_columns(input_type, problem_type): - X = pd.DataFrame({"text": ["string one", "the evalml team is full of wonderful people", "text for a column, this should be a text column!!", "text string", "hello world"], - "another text": ["ladidididididida", "cats are great", "text for a column, this should be a text column!!", "text string", "goodbye world"]}) + X = pd.DataFrame( + { + "text": [ + "string one", + "the evalml team is full of wonderful people", + "text for a column, this should be a text column!!", + "text string", + "hello world", + ], + "another text": [ + "ladidididididida", + "cats are great", + "text for a column, this should be a text column!!", + "text string", + "goodbye world", + ], + } + ) y = pd.Series([0, 0, 1, 1, 0]) - if input_type == 'ww': 
+ if input_type == "ww": X.ww.init() y = ww.init_series(y) estimators = get_estimators(problem_type=problem_type) @@ -289,8 +406,14 @@ def test_make_pipeline_only_text_columns(input_type, problem_type): if problem_type in estimator_class.supported_problem_types: parameters = {} if is_time_series(problem_type): - parameters = {"pipeline": {"date_index": None, "gap": 1, "max_delay": 1}, - "Time Series Baseline Estimator": {"date_index": None, "gap": 1, "max_delay": 1}} + parameters = { + "pipeline": {"date_index": None, "gap": 1, "max_delay": 1}, + "Time Series Baseline Estimator": { + "date_index": None, + "gap": 1, + "max_delay": 1, + }, + } pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) @@ -301,18 +424,26 @@ def test_make_pipeline_only_text_columns(input_type, problem_type): if estimator_class.model_family == ModelFamily.LINEAR_MODEL: standard_scaler = [StandardScaler] if estimator_class.model_family == ModelFamily.ARIMA: - assert pipeline.component_graph == [TextFeaturizer] + standard_scaler + [estimator_class] + assert pipeline.component_graph == [ + TextFeaturizer + ] + standard_scaler + [estimator_class] else: - assert pipeline.component_graph == [TextFeaturizer] + delayed_features + standard_scaler + [estimator_class] + assert pipeline.component_graph == [ + TextFeaturizer + ] + delayed_features + standard_scaler + [estimator_class] @pytest.mark.parametrize("input_type", ["pd", "ww"]) @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) def test_make_pipeline_only_datetime_columns(input_type, problem_type): - X = pd.DataFrame({"some dates": pd.date_range('2000-02-03', periods=5, freq='W'), - "some other dates": pd.date_range('2000-05-19', periods=5, freq='W')}) + X = pd.DataFrame( + { + "some dates": pd.date_range("2000-02-03", periods=5, freq="W"), + "some other dates": pd.date_range("2000-05-19", periods=5, freq="W"), + } + ) y = pd.Series([0, 0, 1, 1, 0]) - if input_type == 'ww': + if input_type == "ww": X.ww.init() y = ww.init_series(y) estimators = get_estimators(problem_type=problem_type) @@ -325,8 +456,14 @@ def test_make_pipeline_only_datetime_columns(input_type, problem_type): if problem_type in estimator_class.supported_problem_types: parameters = {} if is_time_series(problem_type): - parameters = {"pipeline": {"date_index": "some dates", "gap": 1, "max_delay": 1}, - "Time Series Baseline Estimator": {"date_index": "some dates", "gap": 1, "max_delay": 1}} + parameters = { + "pipeline": {"date_index": "some dates", "gap": 1, "max_delay": 1}, + "Time Series Baseline Estimator": { + "date_index": "some dates", + "gap": 1, + "max_delay": 1, + }, + } pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) @@ -339,7 +476,9 @@ def test_make_pipeline_only_datetime_columns(input_type, problem_type): if estimator_class.model_family == ModelFamily.ARIMA: assert pipeline.component_graph == standard_scaler + [estimator_class] else: - assert pipeline.component_graph == [DateTimeFeaturizer] + delayed_features + standard_scaler + [estimator_class] + assert pipeline.component_graph == [ + DateTimeFeaturizer + ] + delayed_features + standard_scaler + [estimator_class] @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) @@ -356,8 +495,14 @@ def test_make_pipeline_numpy_input(problem_type): if problem_type in estimator_class.supported_problem_types: parameters = {} if is_time_series(problem_type): - parameters = 
{"pipeline": {"date_index": None, "gap": 1, "max_delay": 1}, - "Time Series Baseline Estimator": {"date_index": None, "gap": 1, "max_delay": 1}} + parameters = { + "pipeline": {"date_index": None, "gap": 1, "max_delay": 1}, + "Time Series Baseline Estimator": { + "date_index": None, + "gap": 1, + "max_delay": 1, + }, + } pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) @@ -369,18 +514,30 @@ def test_make_pipeline_numpy_input(problem_type): else: estimator_components = [estimator_class] if estimator_class.model_family == ModelFamily.ARIMA: - assert pipeline.component_graph == [DropNullColumns, Imputer] + estimator_components + assert ( + pipeline.component_graph + == [DropNullColumns, Imputer] + estimator_components + ) else: - assert pipeline.component_graph == [DropNullColumns, Imputer] + delayed_features + estimator_components + assert ( + pipeline.component_graph + == [DropNullColumns, Imputer] + + delayed_features + + estimator_components + ) @pytest.mark.parametrize("input_type", ["pd", "ww"]) @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) def test_make_pipeline_datetime_no_categorical(input_type, problem_type): - X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2], - "some dates": pd.date_range('2000-02-03', periods=5, freq='W')}) + X = pd.DataFrame( + { + "numerical": [1, 2, 3, 1, 2], + "some dates": pd.date_range("2000-02-03", periods=5, freq="W"), + } + ) y = pd.Series([0, 1, 1, 0, 0]) - if input_type == 'ww': + if input_type == "ww": X.ww.init() y = ww.init_series(y) @@ -393,8 +550,14 @@ def test_make_pipeline_datetime_no_categorical(input_type, problem_type): if problem_type in estimator_class.supported_problem_types: parameters = {} if is_time_series(problem_type): - parameters = {"pipeline": {"date_index": "soem dates", "gap": 1, "max_delay": 1}, - "Time Series Baseline Estimator": {"date_index": "some dates", "gap": 1, "max_delay": 1}} + parameters = { + "pipeline": {"date_index": "soem dates", "gap": 1, "max_delay": 1}, + "Time Series Baseline Estimator": { + "date_index": "some dates", + "gap": 1, + "max_delay": 1, + }, + } pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters) assert isinstance(pipeline, pipeline_class) @@ -410,53 +573,93 @@ def test_make_pipeline_datetime_no_categorical(input_type, problem_type): if estimator_class.model_family == ModelFamily.ARIMA: assert pipeline.component_graph == [Imputer] + estimator_components else: - assert pipeline.component_graph == [Imputer, DateTimeFeaturizer] + delayed_features + estimator_components + assert ( + pipeline.component_graph + == [Imputer, DateTimeFeaturizer] + + delayed_features + + estimator_components + ) def test_make_pipeline_problem_type_mismatch(): - with pytest.raises(ValueError, match=f"{LogisticRegressionClassifier.name} is not a valid estimator for problem type"): - make_pipeline(pd.DataFrame(), pd.Series(), LogisticRegressionClassifier, ProblemTypes.REGRESSION) - with pytest.raises(ValueError, match=f"{LinearRegressor.name} is not a valid estimator for problem type"): - make_pipeline(pd.DataFrame(), pd.Series(), LinearRegressor, ProblemTypes.MULTICLASS) - with pytest.raises(ValueError, match=f"{Transformer.name} is not a valid estimator for problem type"): + with pytest.raises( + ValueError, + match=f"{LogisticRegressionClassifier.name} is not a valid estimator for problem type", + ): + make_pipeline( + pd.DataFrame(), + pd.Series(), + LogisticRegressionClassifier, + ProblemTypes.REGRESSION, + ) 
+ with pytest.raises( + ValueError, + match=f"{LinearRegressor.name} is not a valid estimator for problem type", + ): + make_pipeline( + pd.DataFrame(), pd.Series(), LinearRegressor, ProblemTypes.MULTICLASS + ) + with pytest.raises( + ValueError, + match=f"{Transformer.name} is not a valid estimator for problem type", + ): make_pipeline(pd.DataFrame(), pd.Series(), Transformer, ProblemTypes.MULTICLASS) -@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION]) -def test_stacked_estimator_in_pipeline(problem_type, X_y_binary, X_y_multi, X_y_regression, - stackable_classifiers, - stackable_regressors, - logistic_regression_binary_pipeline_class, - logistic_regression_multiclass_pipeline_class, - linear_regression_pipeline_class): +@pytest.mark.parametrize( + "problem_type", + [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION], +) +def test_stacked_estimator_in_pipeline( + problem_type, + X_y_binary, + X_y_multi, + X_y_regression, + stackable_classifiers, + stackable_regressors, + logistic_regression_binary_pipeline_class, + logistic_regression_multiclass_pipeline_class, + linear_regression_pipeline_class, +): if problem_type == ProblemTypes.BINARY: X, y = X_y_binary base_pipeline_class = BinaryClassificationPipeline stacking_component_name = StackedEnsembleClassifier.name - input_pipelines = [BinaryClassificationPipeline([classifier]) for classifier in stackable_classifiers] - comparison_pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) - objective = 'Log Loss Binary' + input_pipelines = [ + BinaryClassificationPipeline([classifier]) + for classifier in stackable_classifiers + ] + comparison_pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) + objective = "Log Loss Binary" elif problem_type == ProblemTypes.MULTICLASS: X, y = X_y_multi base_pipeline_class = MulticlassClassificationPipeline stacking_component_name = StackedEnsembleClassifier.name - input_pipelines = [MulticlassClassificationPipeline([classifier]) for classifier in stackable_classifiers] - comparison_pipeline = logistic_regression_multiclass_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) - objective = 'Log Loss Multiclass' + input_pipelines = [ + MulticlassClassificationPipeline([classifier]) + for classifier in stackable_classifiers + ] + comparison_pipeline = logistic_regression_multiclass_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) + objective = "Log Loss Multiclass" elif problem_type == ProblemTypes.REGRESSION: X, y = X_y_regression base_pipeline_class = RegressionPipeline stacking_component_name = StackedEnsembleRegressor.name - input_pipelines = [RegressionPipeline([regressor]) for regressor in stackable_regressors] - comparison_pipeline = linear_regression_pipeline_class(parameters={"Linear Regressor": {"n_jobs": 1}}) - objective = 'R2' + input_pipelines = [ + RegressionPipeline([regressor]) for regressor in stackable_regressors + ] + comparison_pipeline = linear_regression_pipeline_class( + parameters={"Linear Regressor": {"n_jobs": 1}} + ) + objective = "R2" parameters = { - stacking_component_name: { - "input_pipelines": input_pipelines, - "n_jobs": 1 - } + stacking_component_name: {"input_pipelines": input_pipelines, "n_jobs": 1} } - graph = ['Simple Imputer', stacking_component_name] + graph = ["Simple Imputer", stacking_component_name] 
pipeline = base_pipeline_class(component_graph=graph, parameters=parameters) pipeline.fit(X, y) @@ -468,69 +671,131 @@ def test_stacked_estimator_in_pipeline(problem_type, X_y_binary, X_y_multi, X_y_ if problem_type == ProblemTypes.BINARY or problem_type == ProblemTypes.MULTICLASS: assert not np.isnan(pipeline.predict_proba(X)).values.any() - assert (pipeline_score <= comparison_pipeline_score) + assert pipeline_score <= comparison_pipeline_score else: - assert (pipeline_score >= comparison_pipeline_score) + assert pipeline_score >= comparison_pipeline_score def test_make_component_list_from_actions(): assert _make_component_list_from_actions([]) == [] - actions = [DataCheckAction(DataCheckActionCode.DROP_COL, {"columns": ['some col']})] - assert _make_component_list_from_actions(actions) == [DropColumns(columns=['some col'])] - - actions = [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"columns": ['some col']}), - DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, "impute_strategy": "most_frequent"})] - assert _make_component_list_from_actions(actions) == [DropColumns(columns=['some col']), - TargetImputer(impute_strategy="most_frequent")] - - -@pytest.mark.parametrize("samplers", [None, "Undersampler", "SMOTE Oversampler", "SMOTENC Oversampler", "SMOTEN Oversampler"]) -@pytest.mark.parametrize("problem_type", ['binary', 'multiclass', 'regression']) -def test_make_pipeline_samplers(problem_type, samplers, X_y_binary, X_y_multi, X_y_regression, has_minimal_dependencies): - if problem_type == 'binary': + actions = [DataCheckAction(DataCheckActionCode.DROP_COL, {"columns": ["some col"]})] + assert _make_component_list_from_actions(actions) == [ + DropColumns(columns=["some col"]) + ] + + actions = [ + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"columns": ["some col"]} + ), + DataCheckAction( + DataCheckActionCode.IMPUTE_COL, + metadata={ + "column": None, + "is_target": True, + "impute_strategy": "most_frequent", + }, + ), + ] + assert _make_component_list_from_actions(actions) == [ + DropColumns(columns=["some col"]), + TargetImputer(impute_strategy="most_frequent"), + ] + + +@pytest.mark.parametrize( + "samplers", + [ + None, + "Undersampler", + "SMOTE Oversampler", + "SMOTENC Oversampler", + "SMOTEN Oversampler", + ], +) +@pytest.mark.parametrize("problem_type", ["binary", "multiclass", "regression"]) +def test_make_pipeline_samplers( + problem_type, + samplers, + X_y_binary, + X_y_multi, + X_y_regression, + has_minimal_dependencies, +): + if problem_type == "binary": X, y = X_y_binary - elif problem_type == 'multiclass': + elif problem_type == "multiclass": X, y = X_y_multi else: X, y = X_y_regression estimators = get_estimators(problem_type=problem_type) for estimator in estimators: - if problem_type == 'regression' and samplers is not None: - with pytest.raises(ValueError, match='Sampling is unsupported for'): + if problem_type == "regression" and samplers is not None: + with pytest.raises(ValueError, match="Sampling is unsupported for"): make_pipeline(X, y, estimator, problem_type, sampler_name=samplers) else: - pipeline = make_pipeline(X, y, estimator, problem_type, sampler_name=samplers) + pipeline = make_pipeline( + X, y, estimator, problem_type, sampler_name=samplers + ) if has_minimal_dependencies and samplers is not None: - samplers = 'Undersampler' + samplers = "Undersampler" # check that we do add the sampler properly - if samplers is not None and problem_type != 'regression': - assert any('sampler' in 
comp.name for comp in pipeline.component_graph) + if samplers is not None and problem_type != "regression": + assert any("sampler" in comp.name for comp in pipeline.component_graph) else: - assert not any('sampler' in comp.name for comp in pipeline.component_graph) + assert not any( + "sampler" in comp.name for comp in pipeline.component_graph + ) def test_get_estimators(has_minimal_dependencies): if has_minimal_dependencies: assert len(get_estimators(problem_type=ProblemTypes.BINARY)) == 5 - assert len(get_estimators(problem_type=ProblemTypes.BINARY, model_families=[ModelFamily.LINEAR_MODEL])) == 2 + assert ( + len( + get_estimators( + problem_type=ProblemTypes.BINARY, + model_families=[ModelFamily.LINEAR_MODEL], + ) + ) + == 2 + ) assert len(get_estimators(problem_type=ProblemTypes.MULTICLASS)) == 5 assert len(get_estimators(problem_type=ProblemTypes.REGRESSION)) == 5 else: assert len(get_estimators(problem_type=ProblemTypes.BINARY)) == 8 - assert len(get_estimators(problem_type=ProblemTypes.BINARY, model_families=[ModelFamily.LINEAR_MODEL])) == 2 + assert ( + len( + get_estimators( + problem_type=ProblemTypes.BINARY, + model_families=[ModelFamily.LINEAR_MODEL], + ) + ) + == 2 + ) assert len(get_estimators(problem_type=ProblemTypes.MULTICLASS)) == 8 assert len(get_estimators(problem_type=ProblemTypes.REGRESSION)) == 8 assert len(get_estimators(problem_type=ProblemTypes.BINARY, model_families=[])) == 0 - assert len(get_estimators(problem_type=ProblemTypes.MULTICLASS, model_families=[])) == 0 - assert len(get_estimators(problem_type=ProblemTypes.REGRESSION, model_families=[])) == 0 + assert ( + len(get_estimators(problem_type=ProblemTypes.MULTICLASS, model_families=[])) + == 0 + ) + assert ( + len(get_estimators(problem_type=ProblemTypes.REGRESSION, model_families=[])) + == 0 + ) with pytest.raises(RuntimeError, match="Unrecognized model type for problem type"): - get_estimators(problem_type=ProblemTypes.REGRESSION, model_families=["random_forest", "none"]) + get_estimators( + problem_type=ProblemTypes.REGRESSION, + model_families=["random_forest", "none"], + ) with pytest.raises(TypeError, match="model_families parameter is not a list."): - get_estimators(problem_type=ProblemTypes.REGRESSION, model_families='random_forest') + get_estimators( + problem_type=ProblemTypes.REGRESSION, model_families="random_forest" + ) with pytest.raises(KeyError): get_estimators(problem_type="Not A Valid Problem Type") @@ -538,15 +803,15 @@ def test_get_estimators(has_minimal_dependencies): def test_generate_code_pipeline_errors(): class MockBinaryPipeline(BinaryClassificationPipeline): name = "Mock Binary Pipeline" - component_graph = ['Imputer', 'Random Forest Classifier'] + component_graph = ["Imputer", "Random Forest Classifier"] class MockMulticlassPipeline(MulticlassClassificationPipeline): name = "Mock Multiclass Pipeline" - component_graph = ['Imputer', 'Random Forest Classifier'] + component_graph = ["Imputer", "Random Forest Classifier"] class MockRegressionPipeline(RegressionPipeline): name = "Mock Regression Pipeline" - component_graph = ['Imputer', 'Random Forest Regressor'] + component_graph = ["Imputer", "Random Forest Regressor"] with pytest.raises(ValueError, match="Element must be a pipeline instance"): generate_pipeline_code(MockBinaryPipeline) @@ -575,63 +840,91 @@ class CustomEstimator(Estimator): model_family = ModelFamily.NONE def __init__(self, random_arg=False, numpy_arg=[], random_seed=0): - parameters = {'random_arg': random_arg, - 'numpy_arg': numpy_arg} - - 
super().__init__(parameters=parameters, - component_obj=None, - random_seed=random_seed) - - component_graph = ['Imputer', CustomEstimator] - pipeline = BinaryClassificationPipeline(component_graph, custom_name="Mock Binary Pipeline with Transformer", - parameters={'My Custom Estimator': {'numpy_arg': np.array([0])}}) + parameters = {"random_arg": random_arg, "numpy_arg": numpy_arg} + + super().__init__( + parameters=parameters, component_obj=None, random_seed=random_seed + ) + + component_graph = ["Imputer", CustomEstimator] + pipeline = BinaryClassificationPipeline( + component_graph, + custom_name="Mock Binary Pipeline with Transformer", + parameters={"My Custom Estimator": {"numpy_arg": np.array([0])}}, + ) generated_pipeline_code = generate_pipeline_code(pipeline) - assert generated_pipeline_code == "from evalml.pipelines.binary_classification_pipeline import BinaryClassificationPipeline\n" \ - "pipeline = BinaryClassificationPipeline(component_graph=['Imputer', CustomEstimator], " \ - "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, " \ + assert ( + generated_pipeline_code + == "from evalml.pipelines.binary_classification_pipeline import BinaryClassificationPipeline\n" + "pipeline = BinaryClassificationPipeline(component_graph=['Imputer', CustomEstimator], " + "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, " "'My Custom Estimator':{'random_arg': False, 'numpy_arg': array([0])}}, custom_name='Mock Binary Pipeline with Transformer', random_seed=0)" + ) - pipeline = BinaryClassificationPipeline(component_graph, custom_name="Mock Binary Pipeline with Transformer", - parameters={'My Custom Estimator': {'random_arg': Imputer()}}) + pipeline = BinaryClassificationPipeline( + component_graph, + custom_name="Mock Binary Pipeline with Transformer", + parameters={"My Custom Estimator": {"random_arg": Imputer()}}, + ) generated_pipeline_code = generate_pipeline_code(pipeline) - assert generated_pipeline_code == "from evalml.pipelines.binary_classification_pipeline import BinaryClassificationPipeline\n" \ - "pipeline = BinaryClassificationPipeline(component_graph=['Imputer', CustomEstimator], " \ - "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, " \ - "'My Custom Estimator':{'random_arg': Imputer(categorical_impute_strategy='most_frequent', numeric_impute_strategy='mean', categorical_fill_value=None, numeric_fill_value=None), 'numpy_arg': []}}, " \ + assert ( + generated_pipeline_code + == "from evalml.pipelines.binary_classification_pipeline import BinaryClassificationPipeline\n" + "pipeline = BinaryClassificationPipeline(component_graph=['Imputer', CustomEstimator], " + "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, " + "'My Custom Estimator':{'random_arg': Imputer(categorical_impute_strategy='most_frequent', numeric_impute_strategy='mean', categorical_fill_value=None, numeric_fill_value=None), 'numpy_arg': []}}, " "custom_name='Mock Binary Pipeline with Transformer', random_seed=0)" + ) def test_generate_code_pipeline(): - binary_pipeline = BinaryClassificationPipeline(['Imputer', 'Random Forest Classifier']) - 
expected_code = "from evalml.pipelines.binary_classification_pipeline import BinaryClassificationPipeline\n" \ - "pipeline = BinaryClassificationPipeline(component_graph=['Imputer', 'Random Forest Classifier'], " \ - "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, " \ + binary_pipeline = BinaryClassificationPipeline( + ["Imputer", "Random Forest Classifier"] + ) + expected_code = ( + "from evalml.pipelines.binary_classification_pipeline import BinaryClassificationPipeline\n" + "pipeline = BinaryClassificationPipeline(component_graph=['Imputer', 'Random Forest Classifier'], " + "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, " "'Random Forest Classifier':{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}, random_seed=0)" + ) pipeline = generate_pipeline_code(binary_pipeline) assert expected_code == pipeline - regression_pipeline = RegressionPipeline(['Imputer', 'Random Forest Regressor'], custom_name="Mock Regression Pipeline") - expected_code = "from evalml.pipelines.regression_pipeline import RegressionPipeline\n" \ - "pipeline = RegressionPipeline(component_graph=['Imputer', 'Random Forest Regressor'], parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, " \ + regression_pipeline = RegressionPipeline( + ["Imputer", "Random Forest Regressor"], custom_name="Mock Regression Pipeline" + ) + expected_code = ( + "from evalml.pipelines.regression_pipeline import RegressionPipeline\n" + "pipeline = RegressionPipeline(component_graph=['Imputer', 'Random Forest Regressor'], parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, " "'Random Forest Regressor':{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}, custom_name='Mock Regression Pipeline', random_seed=0)" + ) pipeline = generate_pipeline_code(regression_pipeline) assert pipeline == expected_code - regression_pipeline_with_params = RegressionPipeline(['Imputer', 'Random Forest Regressor'], - custom_name="Mock Regression Pipeline", - parameters={"Imputer": {"numeric_impute_strategy": "most_frequent"}, "Random Forest Regressor": {"n_estimators": 50}}) - expected_code_params = "from evalml.pipelines.regression_pipeline import RegressionPipeline\n" \ - "pipeline = RegressionPipeline(component_graph=['Imputer', 'Random Forest Regressor'], " \ - "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'most_frequent', 'categorical_fill_value': None, 'numeric_fill_value': None}, " \ + regression_pipeline_with_params = RegressionPipeline( + ["Imputer", "Random Forest Regressor"], + custom_name="Mock Regression Pipeline", + parameters={ + "Imputer": {"numeric_impute_strategy": "most_frequent"}, + "Random Forest Regressor": {"n_estimators": 50}, + }, + ) + expected_code_params = ( + "from evalml.pipelines.regression_pipeline import RegressionPipeline\n" + "pipeline = RegressionPipeline(component_graph=['Imputer', 'Random Forest Regressor'], " + "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'most_frequent', 'categorical_fill_value': None, 'numeric_fill_value': None}, " "'Random Forest 
Regressor':{'n_estimators': 50, 'max_depth': 6, 'n_jobs': -1}}, custom_name='Mock Regression Pipeline', random_seed=0)" + ) pipeline = generate_pipeline_code(regression_pipeline_with_params) assert pipeline == expected_code_params def test_generate_code_nonlinear_pipeline_error(nonlinear_binary_pipeline_class): pipeline = nonlinear_binary_pipeline_class({}) - with pytest.raises(ValueError, match="Code generation for nonlinear pipelines is not supported yet"): + with pytest.raises( + ValueError, match="Code generation for nonlinear pipelines is not supported yet" + ): generate_pipeline_code(pipeline) @@ -643,9 +936,9 @@ class CustomTransformer(Transformer): def __init__(self, random_seed=0): parameters = {} - super().__init__(parameters=parameters, - component_obj=None, - random_seed=random_seed) + super().__init__( + parameters=parameters, component_obj=None, random_seed=random_seed + ) class CustomEstimator(Estimator): name = "My Custom Estimator" @@ -654,15 +947,19 @@ class CustomEstimator(Estimator): model_family = ModelFamily.NONE def __init__(self, random_arg=False, random_seed=0): - parameters = {'random_arg': random_arg} - - super().__init__(parameters=parameters, - component_obj=None, - random_seed=random_seed) - - mock_pipeline_with_custom_components = BinaryClassificationPipeline([CustomTransformer, CustomEstimator]) - expected_code = "from evalml.pipelines.binary_classification_pipeline import BinaryClassificationPipeline\n" \ - "pipeline = BinaryClassificationPipeline(component_graph=[CustomTransformer, CustomEstimator], " \ + parameters = {"random_arg": random_arg} + + super().__init__( + parameters=parameters, component_obj=None, random_seed=random_seed + ) + + mock_pipeline_with_custom_components = BinaryClassificationPipeline( + [CustomTransformer, CustomEstimator] + ) + expected_code = ( + "from evalml.pipelines.binary_classification_pipeline import BinaryClassificationPipeline\n" + "pipeline = BinaryClassificationPipeline(component_graph=[CustomTransformer, CustomEstimator], " "parameters={'My Custom Estimator':{'random_arg': False}}, random_seed=0)" + ) pipeline = generate_pipeline_code(mock_pipeline_with_custom_components) assert pipeline == expected_code diff --git a/evalml/tests/pipeline_tests/test_pipelines.py b/evalml/tests/pipeline_tests/test_pipelines.py index fbec488d57..0fce0925ed 100644 --- a/evalml/tests/pipeline_tests/test_pipelines.py +++ b/evalml/tests/pipeline_tests/test_pipelines.py @@ -14,20 +14,15 @@ ObjectiveCreationError, ObjectiveNotFoundError, PipelineNotYetFittedError, - PipelineScoreError + PipelineScoreError, ) from evalml.model_family import ModelFamily -from evalml.objectives import ( - CostBenefitMatrix, - FraudCost, - Precision, - get_objective -) +from evalml.objectives import CostBenefitMatrix, FraudCost, Precision, get_objective from evalml.pipelines import ( BinaryClassificationPipeline, MulticlassClassificationPipeline, PipelineBase, - RegressionPipeline + RegressionPipeline, ) from evalml.pipelines.components import ( ElasticNetClassifier, @@ -37,33 +32,45 @@ RandomForestClassifier, RFClassifierSelectFromModel, StandardScaler, - Transformer + Transformer, ) from evalml.pipelines.components.utils import ( _all_estimators_used_in_search, - allowed_model_families + allowed_model_families, ) from evalml.preprocessing.utils import is_classification -from evalml.problem_types import ( - ProblemTypes, - is_binary, - is_multiclass, - is_time_series -) +from evalml.problem_types import ProblemTypes, is_binary, is_multiclass, is_time_series def 
test_allowed_model_families(has_minimal_dependencies): - families = [ModelFamily.RANDOM_FOREST, ModelFamily.LINEAR_MODEL, ModelFamily.EXTRA_TREES, ModelFamily.DECISION_TREE] + families = [ + ModelFamily.RANDOM_FOREST, + ModelFamily.LINEAR_MODEL, + ModelFamily.EXTRA_TREES, + ModelFamily.DECISION_TREE, + ] expected_model_families_binary = set(families) expected_model_families_regression = set(families) if not has_minimal_dependencies: - expected_model_families_binary.update([ModelFamily.XGBOOST, ModelFamily.CATBOOST, ModelFamily.LIGHTGBM]) - expected_model_families_regression.update([ModelFamily.CATBOOST, ModelFamily.XGBOOST, ModelFamily.LIGHTGBM]) - assert set(allowed_model_families(ProblemTypes.BINARY)) == expected_model_families_binary - assert set(allowed_model_families(ProblemTypes.REGRESSION)) == expected_model_families_regression + expected_model_families_binary.update( + [ModelFamily.XGBOOST, ModelFamily.CATBOOST, ModelFamily.LIGHTGBM] + ) + expected_model_families_regression.update( + [ModelFamily.CATBOOST, ModelFamily.XGBOOST, ModelFamily.LIGHTGBM] + ) + assert ( + set(allowed_model_families(ProblemTypes.BINARY)) + == expected_model_families_binary + ) + assert ( + set(allowed_model_families(ProblemTypes.REGRESSION)) + == expected_model_families_regression + ) -def test_all_estimators(has_minimal_dependencies, is_running_py_39_or_above, is_using_conda): +def test_all_estimators( + has_minimal_dependencies, is_running_py_39_or_above, is_using_conda +): if has_minimal_dependencies: assert len((_all_estimators_used_in_search())) == 10 else: @@ -84,69 +91,84 @@ class TestPipelineWithoutComponentGraph(PipelineBase): def test_serialization(X_y_binary, tmpdir, logistic_regression_binary_pipeline_class): X, y = X_y_binary - path = os.path.join(str(tmpdir), 'pipe.pkl') - pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + path = os.path.join(str(tmpdir), "pipe.pkl") + pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) pipeline.fit(X, y) pipeline.save(path) - assert pipeline.score(X, y, ['precision']) == PipelineBase.load(path).score(X, y, ['precision']) + assert pipeline.score(X, y, ["precision"]) == PipelineBase.load(path).score( + X, y, ["precision"] + ) -@patch('cloudpickle.dump') -def test_serialization_protocol(mock_cloudpickle_dump, tmpdir, logistic_regression_binary_pipeline_class): - path = os.path.join(str(tmpdir), 'pipe.pkl') +@patch("cloudpickle.dump") +def test_serialization_protocol( + mock_cloudpickle_dump, tmpdir, logistic_regression_binary_pipeline_class +): + path = os.path.join(str(tmpdir), "pipe.pkl") pipeline = logistic_regression_binary_pipeline_class(parameters={}) pipeline.save(path) assert len(mock_cloudpickle_dump.call_args_list) == 1 - assert mock_cloudpickle_dump.call_args_list[0][1]['protocol'] == cloudpickle.DEFAULT_PROTOCOL + assert ( + mock_cloudpickle_dump.call_args_list[0][1]["protocol"] + == cloudpickle.DEFAULT_PROTOCOL + ) mock_cloudpickle_dump.reset_mock() pipeline.save(path, pickle_protocol=42) assert len(mock_cloudpickle_dump.call_args_list) == 1 - assert mock_cloudpickle_dump.call_args_list[0][1]['protocol'] == 42 + assert mock_cloudpickle_dump.call_args_list[0][1]["protocol"] == 42 @pytest.fixture -def pickled_pipeline_path(X_y_binary, tmpdir, logistic_regression_binary_pipeline_class): +def pickled_pipeline_path( + X_y_binary, tmpdir, logistic_regression_binary_pipeline_class +): X, y = X_y_binary - path = 
os.path.join(str(tmpdir), 'pickled_pipe.pkl') - pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + path = os.path.join(str(tmpdir), "pickled_pipe.pkl") + pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) pipeline.fit(X, y) pipeline.save(path) return path -def test_load_pickled_pipeline_with_custom_objective(X_y_binary, pickled_pipeline_path, logistic_regression_binary_pipeline_class): +def test_load_pickled_pipeline_with_custom_objective( + X_y_binary, pickled_pipeline_path, logistic_regression_binary_pipeline_class +): X, y = X_y_binary # checks that class is not defined before loading in pipeline with pytest.raises(NameError): MockPrecision() # noqa: F821: ignore flake8's "undefined name" error objective = Precision() - pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) pipeline.fit(X, y) - assert PipelineBase.load(pickled_pipeline_path).score(X, y, [objective]) == pipeline.score(X, y, [objective]) + assert PipelineBase.load(pickled_pipeline_path).score( + X, y, [objective] + ) == pipeline.score(X, y, [objective]) def test_reproducibility(X_y_binary, logistic_regression_binary_pipeline_class): X, y = X_y_binary objective = FraudCost( - retry_percentage=.5, - interchange_fee=.02, - fraud_payout_percentage=.75, - amount_col=10 + retry_percentage=0.5, + interchange_fee=0.02, + fraud_payout_percentage=0.75, + amount_col=10, ) parameters = { - 'Imputer': { + "Imputer": { "categorical_impute_strategy": "most_frequent", "numeric_impute_strategy": "mean", }, - 'Logistic Regression Classifier': { - 'penalty': 'l2', - 'C': 1.0, - 'n_jobs': 1 - } + "Logistic Regression Classifier": {"penalty": "l2", "C": 1.0, "n_jobs": 1}, } clf = logistic_regression_binary_pipeline_class(parameters=parameters) @@ -160,17 +182,19 @@ def test_reproducibility(X_y_binary, logistic_regression_binary_pipeline_class): def test_indexing(X_y_binary, logistic_regression_binary_pipeline_class): X, y = X_y_binary - clf = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + clf = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) clf.fit(X, y) assert isinstance(clf[1], OneHotEncoder) - assert isinstance(clf['Imputer'], Imputer) + assert isinstance(clf["Imputer"], Imputer) - setting_err_msg = 'Setting pipeline components is not supported.' + setting_err_msg = "Setting pipeline components is not supported." with pytest.raises(NotImplementedError, match=setting_err_msg): clf[1] = OneHotEncoder() - slicing_err_msg = 'Slicing pipelines is currently not supported.' + slicing_err_msg = "Slicing pipelines is currently not supported." 
with pytest.raises(NotImplementedError, match=slicing_err_msg): clf[:1] @@ -178,32 +202,112 @@ def test_indexing(X_y_binary, logistic_regression_binary_pipeline_class): @pytest.mark.parametrize("is_linear", [True, False]) @pytest.mark.parametrize("is_fitted", [True, False]) @pytest.mark.parametrize("return_dict", [True, False]) -def test_describe_pipeline(is_linear, is_fitted, return_dict, - X_y_binary, caplog, logistic_regression_binary_pipeline_class, nonlinear_binary_pipeline_class): +def test_describe_pipeline( + is_linear, + is_fitted, + return_dict, + X_y_binary, + caplog, + logistic_regression_binary_pipeline_class, + nonlinear_binary_pipeline_class, +): X, y = X_y_binary if is_linear: pipeline = logistic_regression_binary_pipeline_class(parameters={}) name = "Logistic Regression Binary Pipeline" - expected_pipeline_dict = {'name': name, - 'problem_type': ProblemTypes.BINARY, - 'model_family': ModelFamily.LINEAR_MODEL, - 'components': {'Imputer': {'name': 'Imputer', 'parameters': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}}, - 'One Hot Encoder': {'name': 'One Hot Encoder', 'parameters': {'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}}, - 'Standard Scaler': {'name': 'Standard Scaler', 'parameters': {}}, - 'Logistic Regression Classifier': {'name': 'Logistic Regression Classifier', 'parameters': {'penalty': 'l2', 'C': 1.0, 'n_jobs': -1, 'multi_class': 'auto', 'solver': 'lbfgs'}}}} + expected_pipeline_dict = { + "name": name, + "problem_type": ProblemTypes.BINARY, + "model_family": ModelFamily.LINEAR_MODEL, + "components": { + "Imputer": { + "name": "Imputer", + "parameters": { + "categorical_impute_strategy": "most_frequent", + "numeric_impute_strategy": "mean", + "categorical_fill_value": None, + "numeric_fill_value": None, + }, + }, + "One Hot Encoder": { + "name": "One Hot Encoder", + "parameters": { + "top_n": 10, + "features_to_encode": None, + "categories": None, + "drop": "if_binary", + "handle_unknown": "ignore", + "handle_missing": "error", + }, + }, + "Standard Scaler": {"name": "Standard Scaler", "parameters": {}}, + "Logistic Regression Classifier": { + "name": "Logistic Regression Classifier", + "parameters": { + "penalty": "l2", + "C": 1.0, + "n_jobs": -1, + "multi_class": "auto", + "solver": "lbfgs", + }, + }, + }, + } else: pipeline = nonlinear_binary_pipeline_class(parameters={}) name = "Non Linear Binary Pipeline" expected_pipeline_dict = { - 'name': name, - 'problem_type': ProblemTypes.BINARY, - 'model_family': ModelFamily.LINEAR_MODEL, - 'components': {'Imputer': {'name': 'Imputer', 'parameters': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}}, - 'One Hot Encoder': {'name': 'One Hot Encoder', 'parameters': {'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}}, - 'Elastic Net Classifier': {'name': 'Elastic Net Classifier', 'parameters': {'alpha': 0.0001, 'l1_ratio': 0.15, 'n_jobs': -1, 'max_iter': 1000, 'penalty': 'elasticnet', 'loss': 'log'}}, - 'Random Forest Classifier': {'name': 'Random Forest Classifier', 'parameters': {'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}, - 'Logistic Regression Classifier': {'name': 'Logistic Regression Classifier', 'parameters': {'penalty': 'l2', 'C': 1.0, 
'n_jobs': -1, 'multi_class': 'auto', 'solver': 'lbfgs'}}} + "name": name, + "problem_type": ProblemTypes.BINARY, + "model_family": ModelFamily.LINEAR_MODEL, + "components": { + "Imputer": { + "name": "Imputer", + "parameters": { + "categorical_impute_strategy": "most_frequent", + "numeric_impute_strategy": "mean", + "categorical_fill_value": None, + "numeric_fill_value": None, + }, + }, + "One Hot Encoder": { + "name": "One Hot Encoder", + "parameters": { + "top_n": 10, + "features_to_encode": None, + "categories": None, + "drop": "if_binary", + "handle_unknown": "ignore", + "handle_missing": "error", + }, + }, + "Elastic Net Classifier": { + "name": "Elastic Net Classifier", + "parameters": { + "alpha": 0.0001, + "l1_ratio": 0.15, + "n_jobs": -1, + "max_iter": 1000, + "penalty": "elasticnet", + "loss": "log", + }, + }, + "Random Forest Classifier": { + "name": "Random Forest Classifier", + "parameters": {"n_estimators": 100, "max_depth": 6, "n_jobs": -1}, + }, + "Logistic Regression Classifier": { + "name": "Logistic Regression Classifier", + "parameters": { + "penalty": "l2", + "C": 1.0, + "n_jobs": -1, + "multi_class": "auto", + "solver": "lbfgs", + }, + }, + }, } if is_fitted: @@ -234,24 +338,36 @@ def test_describe_pipeline(is_linear, is_fitted, return_dict, def test_nonlinear_model_family(): class DummyNonlinearPipeline(BinaryClassificationPipeline): - component_graph = {'Imputer': ['Imputer'], - 'OneHot': ['One Hot Encoder', 'Imputer.x'], - 'Elastic Net': ['Elastic Net Classifier', 'OneHot.x'], - 'Logistic Regression': ['Logistic Regression Classifier', 'OneHot.x'], - 'Random Forest': ['Random Forest Classifier', 'Logistic Regression', 'Elastic Net']} + component_graph = { + "Imputer": ["Imputer"], + "OneHot": ["One Hot Encoder", "Imputer.x"], + "Elastic Net": ["Elastic Net Classifier", "OneHot.x"], + "Logistic Regression": ["Logistic Regression Classifier", "OneHot.x"], + "Random Forest": [ + "Random Forest Classifier", + "Logistic Regression", + "Elastic Net", + ], + } def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) + super().__init__( + self.component_graph, parameters=parameters, random_seed=random_seed + ) class DummyTransformerEndPipeline(BinaryClassificationPipeline): - component_graph = {'Imputer': ['Imputer'], - 'OneHot': ['One Hot Encoder', 'Imputer.x'], - 'Random Forest': ['Random Forest Classifier', 'OneHot.x'], - 'Logistic Regression': ['Logistic Regression Classifier', 'OneHot.x'], - 'Scaler': ['Standard Scaler', 'Random Forest', 'Logistic Regression']} + component_graph = { + "Imputer": ["Imputer"], + "OneHot": ["One Hot Encoder", "Imputer.x"], + "Random Forest": ["Random Forest Classifier", "OneHot.x"], + "Logistic Regression": ["Logistic Regression Classifier", "OneHot.x"], + "Scaler": ["Standard Scaler", "Random Forest", "Logistic Regression"], + } def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, random_seed=random_seed) + super().__init__( + self.component_graph, parameters=parameters, random_seed=random_seed + ) nlbp = DummyNonlinearPipeline({}) nltp = DummyTransformerEndPipeline({}) @@ -262,305 +378,348 @@ def __init__(self, parameters, random_seed=0): def test_parameters(logistic_regression_binary_pipeline_class): parameters = { - 'Imputer': { + "Imputer": { "categorical_impute_strategy": "most_frequent", - "numeric_impute_strategy": "median" + "numeric_impute_strategy": "median", + }, + "Logistic Regression 
Classifier": { + "penalty": "l2", + "C": 3.0, }, - 'Logistic Regression Classifier': { - 'penalty': 'l2', - 'C': 3.0, - } } lrp = logistic_regression_binary_pipeline_class(parameters=parameters) expected_parameters = { - 'Imputer': { + "Imputer": { "categorical_impute_strategy": "most_frequent", "numeric_impute_strategy": "median", - 'categorical_fill_value': None, - 'numeric_fill_value': None + "categorical_fill_value": None, + "numeric_fill_value": None, }, - 'One Hot Encoder': { - 'top_n': 10, - 'features_to_encode': None, - 'categories': None, - 'drop': 'if_binary', - 'handle_unknown': 'ignore', - 'handle_missing': 'error' + "One Hot Encoder": { + "top_n": 10, + "features_to_encode": None, + "categories": None, + "drop": "if_binary", + "handle_unknown": "ignore", + "handle_missing": "error", + }, + "Logistic Regression Classifier": { + "penalty": "l2", + "C": 3.0, + "n_jobs": -1, + "multi_class": "auto", + "solver": "lbfgs", }, - 'Logistic Regression Classifier': { - 'penalty': 'l2', - 'C': 3.0, - 'n_jobs': -1, - 'multi_class': 'auto', - 'solver': 'lbfgs' - } } assert lrp.parameters == expected_parameters def test_parameters_nonlinear(nonlinear_binary_pipeline_class): parameters = { - 'Imputer': { + "Imputer": { "categorical_impute_strategy": "most_frequent", - "numeric_impute_strategy": "median" + "numeric_impute_strategy": "median", + }, + "Logistic Regression": { + "penalty": "l2", + "C": 3.0, }, - 'Logistic Regression': { - 'penalty': 'l2', - 'C': 3.0, - } } nlbp = nonlinear_binary_pipeline_class(parameters=parameters) expected_parameters = { - 'Imputer': { + "Imputer": { "categorical_impute_strategy": "most_frequent", "numeric_impute_strategy": "median", - 'categorical_fill_value': None, - 'numeric_fill_value': None + "categorical_fill_value": None, + "numeric_fill_value": None, }, - 'OneHot_RandomForest': { - 'top_n': 10, - 'features_to_encode': None, - 'categories': None, - 'drop': 'if_binary', - 'handle_unknown': 'ignore', - 'handle_missing': 'error' + "OneHot_RandomForest": { + "top_n": 10, + "features_to_encode": None, + "categories": None, + "drop": "if_binary", + "handle_unknown": "ignore", + "handle_missing": "error", }, - 'OneHot_ElasticNet': { - 'top_n': 10, - 'features_to_encode': None, - 'categories': None, - 'drop': 'if_binary', - 'handle_unknown': 'ignore', - 'handle_missing': 'error' + "OneHot_ElasticNet": { + "top_n": 10, + "features_to_encode": None, + "categories": None, + "drop": "if_binary", + "handle_unknown": "ignore", + "handle_missing": "error", }, - 'Random Forest': { - 'max_depth': 6, - 'n_estimators': 100, - 'n_jobs': -1 + "Random Forest": {"max_depth": 6, "n_estimators": 100, "n_jobs": -1}, + "Elastic Net": { + "alpha": 0.0001, + "l1_ratio": 0.15, + "loss": "log", + "max_iter": 1000, + "n_jobs": -1, + "penalty": "elasticnet", }, - 'Elastic Net': { - 'alpha': 0.0001, - 'l1_ratio': 0.15, - 'loss': 'log', - 'max_iter': 1000, - 'n_jobs': -1, - 'penalty': 'elasticnet' + "Logistic Regression": { + "penalty": "l2", + "C": 3.0, + "n_jobs": -1, + "multi_class": "auto", + "solver": "lbfgs", }, - 'Logistic Regression': { - 'penalty': 'l2', - 'C': 3.0, - 'n_jobs': -1, - 'multi_class': 'auto', - 'solver': 'lbfgs' - } } assert nlbp.parameters == expected_parameters def test_name(): - pipeline = BinaryClassificationPipeline(component_graph=['Logistic Regression Classifier']) + pipeline = BinaryClassificationPipeline( + component_graph=["Logistic Regression Classifier"] + ) assert pipeline.name == "Logistic Regression Classifier" assert pipeline.custom_name is None - 
pipeline_with_custom_name = BinaryClassificationPipeline(component_graph=['Logistic Regression Classifier'], custom_name="Cool Logistic Regression") + pipeline_with_custom_name = BinaryClassificationPipeline( + component_graph=["Logistic Regression Classifier"], + custom_name="Cool Logistic Regression", + ) assert pipeline_with_custom_name.name == "Cool Logistic Regression" assert pipeline_with_custom_name.custom_name == "Cool Logistic Regression" - pipeline_with_neat_name = BinaryClassificationPipeline(component_graph=['Logistic Regression Classifier'], custom_name="some_neat_name") + pipeline_with_neat_name = BinaryClassificationPipeline( + component_graph=["Logistic Regression Classifier"], custom_name="some_neat_name" + ) assert pipeline_with_neat_name.name == "some_neat_name" assert pipeline_with_neat_name.custom_name == "some_neat_name" def test_multi_format_creation(X_y_binary): X, y = X_y_binary - component_graph = component_graph = ['Imputer', 'One Hot Encoder', StandardScaler, 'Logistic Regression Classifier'] + component_graph = component_graph = [ + "Imputer", + "One Hot Encoder", + StandardScaler, + "Logistic Regression Classifier", + ] parameters = { - 'Imputer': { + "Imputer": { "categorical_impute_strategy": "most_frequent", "numeric_impute_strategy": "mean", }, - 'Logistic Regression Classifier': { - 'penalty': 'l2', - 'C': 1.0, - 'n_jobs': 1 - } + "Logistic Regression Classifier": {"penalty": "l2", "C": 1.0, "n_jobs": 1}, } - clf = BinaryClassificationPipeline(component_graph=component_graph, parameters=parameters) - correct_components = [Imputer, OneHotEncoder, StandardScaler, LogisticRegressionClassifier] + clf = BinaryClassificationPipeline( + component_graph=component_graph, parameters=parameters + ) + correct_components = [ + Imputer, + OneHotEncoder, + StandardScaler, + LogisticRegressionClassifier, + ] for component, correct_components in zip(clf, correct_components): assert isinstance(component, correct_components) assert clf.model_family == ModelFamily.LINEAR_MODEL clf.fit(X, y) - clf.score(X, y, ['precision']) + clf.score(X, y, ["precision"]) assert not clf.feature_importance.isnull().all().all() def test_multiple_feature_selectors(X_y_binary): X, y = X_y_binary - component_graph = ['Imputer', 'One Hot Encoder', 'RF Classifier Select From Model', StandardScaler, 'RF Classifier Select From Model', 'Logistic Regression Classifier'] - - clf = BinaryClassificationPipeline(component_graph=component_graph, - parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) - correct_components = [Imputer, OneHotEncoder, RFClassifierSelectFromModel, StandardScaler, RFClassifierSelectFromModel, LogisticRegressionClassifier] + component_graph = [ + "Imputer", + "One Hot Encoder", + "RF Classifier Select From Model", + StandardScaler, + "RF Classifier Select From Model", + "Logistic Regression Classifier", + ] + + clf = BinaryClassificationPipeline( + component_graph=component_graph, + parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, + ) + correct_components = [ + Imputer, + OneHotEncoder, + RFClassifierSelectFromModel, + StandardScaler, + RFClassifierSelectFromModel, + LogisticRegressionClassifier, + ] for component, correct_components in zip(clf, correct_components): assert isinstance(component, correct_components) assert clf.model_family == ModelFamily.LINEAR_MODEL clf.fit(X, y) - clf.score(X, y, ['precision']) + clf.score(X, y, ["precision"]) assert not clf.feature_importance.isnull().all().all() def test_problem_types(): - with pytest.raises(ValueError, 
match="not valid for this component graph. Valid problem types include *."): - BinaryClassificationPipeline(component_graph=['Random Forest Regressor'], - parameters={}) + with pytest.raises( + ValueError, + match="not valid for this component graph. Valid problem types include *.", + ): + BinaryClassificationPipeline( + component_graph=["Random Forest Regressor"], parameters={} + ) def make_mock_regression_pipeline(): - return RegressionPipeline(component_graph=['Random Forest Regressor'], parameters={}) + return RegressionPipeline( + component_graph=["Random Forest Regressor"], parameters={} + ) def make_mock_binary_pipeline(): - return BinaryClassificationPipeline(component_graph=['Random Forest Classifier'], parameters={}) + return BinaryClassificationPipeline( + component_graph=["Random Forest Classifier"], parameters={} + ) def make_mock_multiclass_pipeline(): - return MulticlassClassificationPipeline(component_graph=['Random Forest Classifier'], parameters={}) + return MulticlassClassificationPipeline( + component_graph=["Random Forest Classifier"], parameters={} + ) -@patch('evalml.pipelines.RegressionPipeline.fit') -@patch('evalml.pipelines.RegressionPipeline.predict') +@patch("evalml.pipelines.RegressionPipeline.fit") +@patch("evalml.pipelines.RegressionPipeline.predict") def test_score_regression_single(mock_predict, mock_fit, X_y_regression): X, y = X_y_regression mock_predict.return_value = pd.Series(y) clf = make_mock_regression_pipeline() clf.fit(X, y) - objective_names = ['r2'] + objective_names = ["r2"] scores = clf.score(X, y, objective_names) mock_predict.assert_called() - assert scores == {'R2': 1.0} + assert scores == {"R2": 1.0} -@patch('evalml.pipelines.ComponentGraph.fit') -@patch('evalml.pipelines.RegressionPipeline.predict') -def test_score_nonlinear_regression(mock_predict, mock_fit, nonlinear_regression_pipeline_class, X_y_regression): +@patch("evalml.pipelines.ComponentGraph.fit") +@patch("evalml.pipelines.RegressionPipeline.predict") +def test_score_nonlinear_regression( + mock_predict, mock_fit, nonlinear_regression_pipeline_class, X_y_regression +): X, y = X_y_regression mock_predict.return_value = pd.Series(y) clf = nonlinear_regression_pipeline_class({}) clf.fit(X, y) - objective_names = ['r2'] + objective_names = ["r2"] scores = clf.score(X, y, objective_names) mock_predict.assert_called() - assert scores == {'R2': 1.0} + assert scores == {"R2": 1.0} -@patch('evalml.pipelines.BinaryClassificationPipeline._encode_targets') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -@patch('evalml.pipelines.components.Estimator.predict') +@patch("evalml.pipelines.BinaryClassificationPipeline._encode_targets") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +@patch("evalml.pipelines.components.Estimator.predict") def test_score_binary_single(mock_predict, mock_fit, mock_encode, X_y_binary): X, y = X_y_binary mock_predict.return_value = y mock_encode.return_value = y clf = make_mock_binary_pipeline() clf.fit(X, y) - objective_names = ['f1'] + objective_names = ["f1"] scores = clf.score(X, y, objective_names) mock_encode.assert_called() mock_fit.assert_called() mock_predict.assert_called() - assert scores == {'F1': 1.0} + assert scores == {"F1": 1.0} -@patch('evalml.pipelines.MulticlassClassificationPipeline._encode_targets') -@patch('evalml.pipelines.MulticlassClassificationPipeline.fit') -@patch('evalml.pipelines.components.Estimator.predict') +@patch("evalml.pipelines.MulticlassClassificationPipeline._encode_targets") 
+@patch("evalml.pipelines.MulticlassClassificationPipeline.fit") +@patch("evalml.pipelines.components.Estimator.predict") def test_score_multiclass_single(mock_predict, mock_fit, mock_encode, X_y_binary): X, y = X_y_binary mock_predict.return_value = y mock_encode.return_value = y clf = make_mock_multiclass_pipeline() clf.fit(X, y) - objective_names = ['f1 micro'] + objective_names = ["f1 micro"] scores = clf.score(X, y, objective_names) mock_encode.assert_called() mock_fit.assert_called() mock_predict.assert_called() - assert scores == {'F1 Micro': 1.0} + assert scores == {"F1 Micro": 1.0} -@patch('evalml.pipelines.MulticlassClassificationPipeline._encode_targets') -@patch('evalml.pipelines.MulticlassClassificationPipeline.fit') -@patch('evalml.pipelines.ComponentGraph.predict') -def test_score_nonlinear_multiclass(mock_predict, mock_fit, mock_encode, nonlinear_multiclass_pipeline_class, X_y_multi): +@patch("evalml.pipelines.MulticlassClassificationPipeline._encode_targets") +@patch("evalml.pipelines.MulticlassClassificationPipeline.fit") +@patch("evalml.pipelines.ComponentGraph.predict") +def test_score_nonlinear_multiclass( + mock_predict, mock_fit, mock_encode, nonlinear_multiclass_pipeline_class, X_y_multi +): X, y = X_y_multi mock_predict.return_value = pd.Series(y) mock_encode.return_value = pd.Series(y) clf = nonlinear_multiclass_pipeline_class({}) clf.fit(X, y) - objective_names = ['f1 micro', 'precision micro'] + objective_names = ["f1 micro", "precision micro"] scores = clf.score(X, y, objective_names) mock_predict.assert_called() - assert scores == {'F1 Micro': 1.0, 'Precision Micro': 1.0} + assert scores == {"F1 Micro": 1.0, "Precision Micro": 1.0} -@patch('evalml.pipelines.RegressionPipeline.fit') -@patch('evalml.pipelines.RegressionPipeline.predict') +@patch("evalml.pipelines.RegressionPipeline.fit") +@patch("evalml.pipelines.RegressionPipeline.predict") def test_score_regression_list(mock_predict, mock_fit, X_y_binary): X, y = X_y_binary mock_predict.return_value = pd.Series(y) clf = make_mock_regression_pipeline() clf.fit(X, y) - objective_names = ['r2', 'mse'] + objective_names = ["r2", "mse"] scores = clf.score(X, y, objective_names) mock_predict.assert_called() - assert scores == {'R2': 1.0, 'MSE': 0.0} + assert scores == {"R2": 1.0, "MSE": 0.0} -@patch('evalml.pipelines.BinaryClassificationPipeline._encode_targets') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -@patch('evalml.pipelines.components.Estimator.predict') +@patch("evalml.pipelines.BinaryClassificationPipeline._encode_targets") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +@patch("evalml.pipelines.components.Estimator.predict") def test_score_binary_list(mock_predict, mock_fit, mock_encode, X_y_binary): X, y = X_y_binary mock_predict.return_value = y mock_encode.return_value = y clf = make_mock_binary_pipeline() clf.fit(X, y) - objective_names = ['f1', 'precision'] + objective_names = ["f1", "precision"] scores = clf.score(X, y, objective_names) mock_fit.assert_called() mock_encode.assert_called() mock_predict.assert_called() - assert scores == {'F1': 1.0, 'Precision': 1.0} + assert scores == {"F1": 1.0, "Precision": 1.0} -@patch('evalml.pipelines.MulticlassClassificationPipeline._encode_targets') -@patch('evalml.pipelines.MulticlassClassificationPipeline.fit') -@patch('evalml.pipelines.components.Estimator.predict') +@patch("evalml.pipelines.MulticlassClassificationPipeline._encode_targets") +@patch("evalml.pipelines.MulticlassClassificationPipeline.fit") 
+@patch("evalml.pipelines.components.Estimator.predict") def test_score_multi_list(mock_predict, mock_fit, mock_encode, X_y_binary): X, y = X_y_binary mock_predict.return_value = y mock_encode.return_value = y clf = make_mock_multiclass_pipeline() clf.fit(X, y) - objective_names = ['f1 micro', 'precision micro'] + objective_names = ["f1 micro", "precision micro"] scores = clf.score(X, y, objective_names) mock_predict.assert_called() - assert scores == {'F1 Micro': 1.0, 'Precision Micro': 1.0} + assert scores == {"F1 Micro": 1.0, "Precision Micro": 1.0} -@patch('evalml.objectives.R2.score') -@patch('evalml.pipelines.RegressionPipeline.fit') -@patch('evalml.pipelines.RegressionPipeline.predict') -def test_score_regression_objective_error(mock_predict, mock_fit, mock_objective_score, X_y_binary): - mock_objective_score.side_effect = Exception('finna kabooom 💣') +@patch("evalml.objectives.R2.score") +@patch("evalml.pipelines.RegressionPipeline.fit") +@patch("evalml.pipelines.RegressionPipeline.predict") +def test_score_regression_objective_error( + mock_predict, mock_fit, mock_objective_score, X_y_binary +): + mock_objective_score.side_effect = Exception("finna kabooom 💣") X, y = X_y_binary mock_predict.return_value = pd.Series(y) clf = make_mock_regression_pipeline() clf.fit(X, y) - objective_names = ['r2', 'mse'] + objective_names = ["r2", "mse"] # Using pytest.raises to make sure we error if an error is not thrown. with pytest.raises(PipelineScoreError): _ = clf.score(X, y, objective_names) @@ -568,22 +727,24 @@ def test_score_regression_objective_error(mock_predict, mock_fit, mock_objective _ = clf.score(X, y, objective_names) except PipelineScoreError as e: assert e.scored_successfully == {"MSE": 0.0} - assert 'finna kabooom 💣' in e.message + assert "finna kabooom 💣" in e.message assert "R2" in e.exceptions -@patch('evalml.pipelines.BinaryClassificationPipeline._encode_targets') -@patch('evalml.objectives.F1.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -@patch('evalml.pipelines.components.Estimator.predict') -def test_score_binary_objective_error(mock_predict, mock_fit, mock_objective_score, mock_encode, X_y_binary): - mock_objective_score.side_effect = Exception('finna kabooom 💣') +@patch("evalml.pipelines.BinaryClassificationPipeline._encode_targets") +@patch("evalml.objectives.F1.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +@patch("evalml.pipelines.components.Estimator.predict") +def test_score_binary_objective_error( + mock_predict, mock_fit, mock_objective_score, mock_encode, X_y_binary +): + mock_objective_score.side_effect = Exception("finna kabooom 💣") X, y = X_y_binary mock_predict.return_value = y mock_encode.return_value = y clf = make_mock_binary_pipeline() clf.fit(X, y) - objective_names = ['f1', 'precision'] + objective_names = ["f1", "precision"] # Using pytest.raises to make sure we error if an error is not thrown. 
with pytest.raises(PipelineScoreError): _ = clf.score(X, y, objective_names) @@ -591,21 +752,28 @@ def test_score_binary_objective_error(mock_predict, mock_fit, mock_objective_sco _ = clf.score(X, y, objective_names) except PipelineScoreError as e: assert e.scored_successfully == {"Precision": 1.0} - assert 'finna kabooom 💣' in e.message - - -@patch('evalml.pipelines.BinaryClassificationPipeline._encode_targets') -@patch('evalml.objectives.F1.score') -@patch('evalml.pipelines.BinaryClassificationPipeline.fit') -@patch('evalml.pipelines.ComponentGraph.predict') -def test_score_nonlinear_binary_objective_error(mock_predict, mock_fit, mock_objective_score, mock_encode, nonlinear_binary_pipeline_class, X_y_binary): - mock_objective_score.side_effect = Exception('finna kabooom 💣') + assert "finna kabooom 💣" in e.message + + +@patch("evalml.pipelines.BinaryClassificationPipeline._encode_targets") +@patch("evalml.objectives.F1.score") +@patch("evalml.pipelines.BinaryClassificationPipeline.fit") +@patch("evalml.pipelines.ComponentGraph.predict") +def test_score_nonlinear_binary_objective_error( + mock_predict, + mock_fit, + mock_objective_score, + mock_encode, + nonlinear_binary_pipeline_class, + X_y_binary, +): + mock_objective_score.side_effect = Exception("finna kabooom 💣") X, y = X_y_binary mock_predict.return_value = pd.Series(y) mock_encode.return_value = y clf = nonlinear_binary_pipeline_class({}) clf.fit(X, y) - objective_names = ['f1', 'precision'] + objective_names = ["f1", "precision"] # Using pytest.raises to make sure we error if an error is not thrown. with pytest.raises(PipelineScoreError): _ = clf.score(X, y, objective_names) @@ -613,21 +781,23 @@ def test_score_nonlinear_binary_objective_error(mock_predict, mock_fit, mock_obj _ = clf.score(X, y, objective_names) except PipelineScoreError as e: assert e.scored_successfully == {"Precision": 1.0} - assert 'finna kabooom 💣' in e.message + assert "finna kabooom 💣" in e.message -@patch('evalml.pipelines.MulticlassClassificationPipeline._encode_targets') -@patch('evalml.objectives.F1Micro.score') -@patch('evalml.pipelines.MulticlassClassificationPipeline.fit') -@patch('evalml.pipelines.components.Estimator.predict') -def test_score_multiclass_objective_error(mock_predict, mock_fit, mock_objective_score, mock_encode, X_y_binary): - mock_objective_score.side_effect = Exception('finna kabooom 💣') +@patch("evalml.pipelines.MulticlassClassificationPipeline._encode_targets") +@patch("evalml.objectives.F1Micro.score") +@patch("evalml.pipelines.MulticlassClassificationPipeline.fit") +@patch("evalml.pipelines.components.Estimator.predict") +def test_score_multiclass_objective_error( + mock_predict, mock_fit, mock_objective_score, mock_encode, X_y_binary +): + mock_objective_score.side_effect = Exception("finna kabooom 💣") X, y = X_y_binary mock_predict.return_value = y mock_encode.return_value = y clf = make_mock_multiclass_pipeline() clf.fit(X, y) - objective_names = ['f1 micro', 'precision micro'] + objective_names = ["f1 micro", "precision micro"] # Using pytest.raises to make sure we error if an error is not thrown. 
with pytest.raises(PipelineScoreError): _ = clf.score(X, y, objective_names) @@ -635,14 +805,20 @@ def test_score_multiclass_objective_error(mock_predict, mock_fit, mock_objective _ = clf.score(X, y, objective_names) except PipelineScoreError as e: assert e.scored_successfully == {"Precision Micro": 1.0} - assert 'finna kabooom 💣' in e.message + assert "finna kabooom 💣" in e.message assert "F1 Micro" in e.exceptions -@patch('evalml.pipelines.components.Imputer.transform') -@patch('evalml.pipelines.components.OneHotEncoder.transform') -@patch('evalml.pipelines.components.StandardScaler.transform') -def test_compute_estimator_features(mock_scaler, mock_ohe, mock_imputer, X_y_binary, logistic_regression_binary_pipeline_class): +@patch("evalml.pipelines.components.Imputer.transform") +@patch("evalml.pipelines.components.OneHotEncoder.transform") +@patch("evalml.pipelines.components.StandardScaler.transform") +def test_compute_estimator_features( + mock_scaler, + mock_ohe, + mock_imputer, + X_y_binary, + logistic_regression_binary_pipeline_class, +): X, y = X_y_binary X = pd.DataFrame(X) X_expected = pd.DataFrame(index=X.index, columns=X.columns).fillna(0) @@ -661,17 +837,26 @@ def test_compute_estimator_features(mock_scaler, mock_ohe, mock_imputer, X_y_bin assert mock_scaler.call_count == 2 -@patch('evalml.pipelines.components.Imputer.transform') -@patch('evalml.pipelines.components.OneHotEncoder.transform') -@patch('evalml.pipelines.components.RandomForestClassifier.predict') -@patch('evalml.pipelines.components.ElasticNetClassifier.predict') -def test_compute_estimator_features_nonlinear(mock_en_predict, mock_rf_predict, mock_ohe, mock_imputer, X_y_binary, nonlinear_binary_pipeline_class): +@patch("evalml.pipelines.components.Imputer.transform") +@patch("evalml.pipelines.components.OneHotEncoder.transform") +@patch("evalml.pipelines.components.RandomForestClassifier.predict") +@patch("evalml.pipelines.components.ElasticNetClassifier.predict") +def test_compute_estimator_features_nonlinear( + mock_en_predict, + mock_rf_predict, + mock_ohe, + mock_imputer, + X_y_binary, + nonlinear_binary_pipeline_class, +): X, y = X_y_binary mock_imputer.return_value = pd.DataFrame(X) mock_ohe.return_value = pd.DataFrame(X) mock_en_predict.return_value = pd.Series(np.ones(X.shape[0])) mock_rf_predict.return_value = pd.Series(np.zeros(X.shape[0])) - X_expected_df = pd.DataFrame({'Random Forest': np.zeros(X.shape[0]), 'Elastic Net': np.ones(X.shape[0])}) + X_expected_df = pd.DataFrame( + {"Random Forest": np.zeros(X.shape[0]), "Elastic Net": np.ones(X.shape[0])} + ) pipeline = nonlinear_binary_pipeline_class({}) pipeline.fit(X, y) @@ -687,160 +872,200 @@ def test_compute_estimator_features_nonlinear(mock_en_predict, mock_rf_predict, def test_no_default_parameters(): class MockComponent(Transformer): name = "Mock Component" - hyperparameter_ranges = { - 'a': [0, 1, 2] - } + hyperparameter_ranges = {"a": [0, 1, 2]} - def __init__(self, a, b=1, c='2', random_seed=0): + def __init__(self, a, b=1, c="2", random_seed=0): self.a = a self.b = b self.c = c super().__init__() class TestPipeline(BinaryClassificationPipeline): - component_graph = [MockComponent, 'Logistic Regression Classifier'] + component_graph = [MockComponent, "Logistic Regression Classifier"] def __init__(self, parameters, random_seed=0): super().__init__(self.component_graph, parameters=parameters) - with pytest.raises(ValueError, match="Error received when instantiating component *."): + with pytest.raises( + ValueError, match="Error received when 
instantiating component *." + ): TestPipeline(parameters={}) - assert TestPipeline(parameters={'Mock Component': {'a': 42}}) + assert TestPipeline(parameters={"Mock Component": {"a": 42}}) def test_init_components_invalid_parameters(): - component_graph = ['RF Classifier Select From Model', 'Logistic Regression Classifier'] - parameters = { - 'Logistic Regression Classifier': { - "cool_parameter": "yes" - } - } + component_graph = [ + "RF Classifier Select From Model", + "Logistic Regression Classifier", + ] + parameters = {"Logistic Regression Classifier": {"cool_parameter": "yes"}} with pytest.raises(ValueError, match="Error received when instantiating component"): - BinaryClassificationPipeline(component_graph=component_graph, parameters=parameters) + BinaryClassificationPipeline( + component_graph=component_graph, parameters=parameters + ) def test_correct_parameters(logistic_regression_binary_pipeline_class): parameters = { - 'Imputer': { - 'categorical_impute_strategy': 'most_frequent', - 'numeric_impute_strategy': 'mean' + "Imputer": { + "categorical_impute_strategy": "most_frequent", + "numeric_impute_strategy": "mean", + }, + "Logistic Regression Classifier": { + "penalty": "l2", + "C": 3.0, }, - 'Logistic Regression Classifier': { - 'penalty': 'l2', - 'C': 3.0, - } } lr_pipeline = logistic_regression_binary_pipeline_class(parameters=parameters) assert lr_pipeline.estimator.random_seed == 0 - assert lr_pipeline.estimator.parameters['C'] == 3.0 - assert lr_pipeline['Imputer'].parameters['categorical_impute_strategy'] == 'most_frequent' - assert lr_pipeline['Imputer'].parameters['numeric_impute_strategy'] == 'mean' + assert lr_pipeline.estimator.parameters["C"] == 3.0 + assert ( + lr_pipeline["Imputer"].parameters["categorical_impute_strategy"] + == "most_frequent" + ) + assert lr_pipeline["Imputer"].parameters["numeric_impute_strategy"] == "mean" def test_correct_nonlinear_parameters(nonlinear_binary_pipeline_class): parameters = { - 'Imputer': { - 'categorical_impute_strategy': 'most_frequent', - 'numeric_impute_strategy': 'mean' + "Imputer": { + "categorical_impute_strategy": "most_frequent", + "numeric_impute_strategy": "mean", }, - 'OneHot_RandomForest': { - 'top_n': 4 + "OneHot_RandomForest": {"top_n": 4}, + "Logistic Regression": { + "penalty": "l2", + "C": 3.0, }, - 'Logistic Regression': { - 'penalty': 'l2', - 'C': 3.0, - } } nlb_pipeline = nonlinear_binary_pipeline_class(parameters=parameters) assert nlb_pipeline.estimator.random_seed == 0 - assert nlb_pipeline.estimator.parameters['C'] == 3.0 - assert nlb_pipeline['Imputer'].parameters['categorical_impute_strategy'] == 'most_frequent' - assert nlb_pipeline['Imputer'].parameters['numeric_impute_strategy'] == 'mean' - assert nlb_pipeline['OneHot_RandomForest'].parameters['top_n'] == 4 - assert nlb_pipeline['OneHot_ElasticNet'].parameters['top_n'] == 10 + assert nlb_pipeline.estimator.parameters["C"] == 3.0 + assert ( + nlb_pipeline["Imputer"].parameters["categorical_impute_strategy"] + == "most_frequent" + ) + assert nlb_pipeline["Imputer"].parameters["numeric_impute_strategy"] == "mean" + assert nlb_pipeline["OneHot_RandomForest"].parameters["top_n"] == 4 + assert nlb_pipeline["OneHot_ElasticNet"].parameters["top_n"] == 10 -@patch('evalml.pipelines.components.Estimator.predict') -def test_score_with_objective_that_requires_predict_proba(mock_predict, dummy_regression_pipeline_class, X_y_binary): +@patch("evalml.pipelines.components.Estimator.predict") +def test_score_with_objective_that_requires_predict_proba( + 
mock_predict, dummy_regression_pipeline_class, X_y_binary +): X, y = X_y_binary mock_predict.return_value = pd.Series([1] * 100) # Using pytest.raises to make sure we error if an error is not thrown. with pytest.raises(PipelineScoreError): clf = dummy_regression_pipeline_class(parameters={}) clf.fit(X, y) - clf.score(X, y, ['precision', 'auc']) + clf.score(X, y, ["precision", "auc"]) try: clf = dummy_regression_pipeline_class(parameters={}) clf.fit(X, y) - clf.score(X, y, ['precision', 'auc']) + clf.score(X, y, ["precision", "auc"]) except PipelineScoreError as e: - assert "Invalid objective AUC specified for problem type regression" in e.message - assert "Invalid objective Precision specified for problem type regression" in e.message + assert ( + "Invalid objective AUC specified for problem type regression" in e.message + ) + assert ( + "Invalid objective Precision specified for problem type regression" + in e.message + ) mock_predict.assert_called() def test_score_auc(X_y_binary, logistic_regression_binary_pipeline_class): X, y = X_y_binary - lr_pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + lr_pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) lr_pipeline.fit(X, y) - lr_pipeline.score(X, y, ['auc']) + lr_pipeline.score(X, y, ["auc"]) def test_pipeline_summary(): - assert BinaryClassificationPipeline(["Imputer", "One Hot Encoder"]).summary == "Pipeline w/ Imputer + One Hot Encoder" + assert ( + BinaryClassificationPipeline(["Imputer", "One Hot Encoder"]).summary + == "Pipeline w/ Imputer + One Hot Encoder" + ) assert BinaryClassificationPipeline(["Imputer"]).summary == "Pipeline w/ Imputer" - assert BinaryClassificationPipeline(["Random Forest Classifier"]).summary == "Random Forest Classifier" + assert ( + BinaryClassificationPipeline(["Random Forest Classifier"]).summary + == "Random Forest Classifier" + ) assert BinaryClassificationPipeline([]).summary == "Empty Pipeline" - assert BinaryClassificationPipeline(["Imputer", "One Hot Encoder", "Random Forest Classifier"]).summary == "Random Forest Classifier w/ Imputer + One Hot Encoder" + assert ( + BinaryClassificationPipeline( + ["Imputer", "One Hot Encoder", "Random Forest Classifier"] + ).summary + == "Random Forest Classifier w/ Imputer + One Hot Encoder" + ) -def test_nonlinear_pipeline_summary(nonlinear_binary_pipeline_class, nonlinear_multiclass_pipeline_class, nonlinear_regression_pipeline_class): - assert nonlinear_binary_pipeline_class({}).summary == "Logistic Regression Classifier w/ Imputer + One Hot Encoder + One Hot Encoder + Random Forest Classifier + Elastic Net Classifier" - assert nonlinear_multiclass_pipeline_class({}).summary == "Logistic Regression Classifier w/ Imputer + One Hot Encoder + One Hot Encoder + Random Forest Classifier + Elastic Net Classifier" - assert nonlinear_regression_pipeline_class({}).summary == "Linear Regressor w/ Imputer + One Hot Encoder + Random Forest Regressor + Elastic Net Regressor" +def test_nonlinear_pipeline_summary( + nonlinear_binary_pipeline_class, + nonlinear_multiclass_pipeline_class, + nonlinear_regression_pipeline_class, +): + assert ( + nonlinear_binary_pipeline_class({}).summary + == "Logistic Regression Classifier w/ Imputer + One Hot Encoder + One Hot Encoder + Random Forest Classifier + Elastic Net Classifier" + ) + assert ( + nonlinear_multiclass_pipeline_class({}).summary + == "Logistic Regression Classifier w/ Imputer + One Hot 
Encoder + One Hot Encoder + Random Forest Classifier + Elastic Net Classifier" + ) + assert ( + nonlinear_regression_pipeline_class({}).summary + == "Linear Regressor w/ Imputer + One Hot Encoder + Random Forest Regressor + Elastic Net Regressor" + ) def test_drop_columns_in_pipeline(): parameters = { - 'Drop Columns Transformer': { - 'columns': ["column to drop"] - }, - 'Imputer': { + "Drop Columns Transformer": {"columns": ["column to drop"]}, + "Imputer": { "categorical_impute_strategy": "most_frequent", - "numeric_impute_strategy": "mean" + "numeric_impute_strategy": "mean", }, - 'Logistic Regression Classifier': { - 'penalty': 'l2', - 'C': 3.0, - 'n_jobs': 1 - } + "Logistic Regression Classifier": {"penalty": "l2", "C": 3.0, "n_jobs": 1}, } - pipeline_with_drop_col = BinaryClassificationPipeline(component_graph=['Drop Columns Transformer', 'Imputer', 'Logistic Regression Classifier'], - parameters=parameters) + pipeline_with_drop_col = BinaryClassificationPipeline( + component_graph=[ + "Drop Columns Transformer", + "Imputer", + "Logistic Regression Classifier", + ], + parameters=parameters, + ) X = pd.DataFrame({"column to drop": [1, 0, 1, 3], "other col": [1, 2, 4, 1]}) y = pd.Series([1, 0, 1, 0]) pipeline_with_drop_col.fit(X, y) - pipeline_with_drop_col.score(X, y, ['auc']) - assert list(pipeline_with_drop_col.feature_importance["feature"]) == ['other col'] + pipeline_with_drop_col.score(X, y, ["auc"]) + assert list(pipeline_with_drop_col.feature_importance["feature"]) == ["other col"] @pytest.mark.parametrize("is_linear", [True, False]) -def test_clone_init(is_linear, linear_regression_pipeline_class, nonlinear_regression_pipeline_class): +def test_clone_init( + is_linear, linear_regression_pipeline_class, nonlinear_regression_pipeline_class +): if is_linear: pipeline_class = linear_regression_pipeline_class else: pipeline_class = nonlinear_regression_pipeline_class parameters = { - 'Imputer': { + "Imputer": { "categorical_impute_strategy": "most_frequent", - "numeric_impute_strategy": "mean" + "numeric_impute_strategy": "mean", + }, + "Linear Regressor": { + "fit_intercept": True, + "normalize": True, }, - 'Linear Regressor': { - 'fit_intercept': True, - 'normalize': True, - } } pipeline = pipeline_class(parameters=parameters, random_seed=42) pipeline_clone = pipeline.clone() @@ -849,13 +1074,20 @@ def test_clone_init(is_linear, linear_regression_pipeline_class, nonlinear_regre @pytest.mark.parametrize("is_linear", [True, False]) -def test_clone_fitted(is_linear, X_y_binary, logistic_regression_binary_pipeline_class, nonlinear_binary_pipeline_class): +def test_clone_fitted( + is_linear, + X_y_binary, + logistic_regression_binary_pipeline_class, + nonlinear_binary_pipeline_class, +): X, y = X_y_binary if is_linear: pipeline_class = logistic_regression_binary_pipeline_class else: pipeline_class = nonlinear_binary_pipeline_class - pipeline = pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, random_seed=42) + pipeline = pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, random_seed=42 + ) pipeline.fit(X, y) X_t = pipeline.predict_proba(X) @@ -871,25 +1103,23 @@ def test_clone_fitted(is_linear, X_y_binary, logistic_regression_binary_pipeline assert_frame_equal(X_t, X_t_clone) -def test_feature_importance_has_feature_names(X_y_binary, logistic_regression_binary_pipeline_class): +def test_feature_importance_has_feature_names( + X_y_binary, logistic_regression_binary_pipeline_class +): X, y = X_y_binary col_names = 
["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) parameters = { - 'Imputer': { + "Imputer": { "categorical_impute_strategy": "most_frequent", - "numeric_impute_strategy": "mean" + "numeric_impute_strategy": "mean", }, - 'RF Classifier Select From Model': { + "RF Classifier Select From Model": { "percent_features": 1.0, "number_features": len(X.columns), - "n_estimators": 20 + "n_estimators": 20, }, - 'Logistic Regression Classifier': { - 'penalty': 'l2', - 'C': 1.0, - 'n_jobs': 1 - } + "Logistic Regression Classifier": {"penalty": "l2", "C": 1.0, "n_jobs": 1}, } clf = logistic_regression_binary_pipeline_class(parameters=parameters) @@ -899,50 +1129,58 @@ def test_feature_importance_has_feature_names(X_y_binary, logistic_regression_bi assert sorted(clf.feature_importance["feature"]) == sorted(col_names) -def test_nonlinear_feature_importance_has_feature_names(X_y_binary, nonlinear_binary_pipeline_class): +def test_nonlinear_feature_importance_has_feature_names( + X_y_binary, nonlinear_binary_pipeline_class +): X, y = X_y_binary col_names = ["col_{}".format(i) for i in range(len(X[0]))] X = pd.DataFrame(X, columns=col_names) parameters = { - 'Imputer': { + "Imputer": { "categorical_impute_strategy": "most_frequent", - "numeric_impute_strategy": "mean" + "numeric_impute_strategy": "mean", }, - 'Logistic Regression Classifier': { - 'penalty': 'l2', - 'C': 1.0, - 'n_jobs': 1 - } + "Logistic Regression Classifier": {"penalty": "l2", "C": 1.0, "n_jobs": 1}, } clf = nonlinear_binary_pipeline_class(parameters=parameters) clf.fit(X, y) assert len(clf.feature_importance) == 2 assert not clf.feature_importance.isnull().all().all() - assert sorted(clf.feature_importance["feature"]) == ['Elastic Net', 'Random Forest'] + assert sorted(clf.feature_importance["feature"]) == ["Elastic Net", "Random Forest"] -@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION]) -def test_feature_importance_has_feature_names_xgboost(problem_type, has_minimal_dependencies, - X_y_regression, X_y_binary, X_y_multi): +@pytest.mark.parametrize( + "problem_type", + [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION], +) +def test_feature_importance_has_feature_names_xgboost( + problem_type, has_minimal_dependencies, X_y_regression, X_y_binary, X_y_multi +): # Testing that we store the original feature names since we map to numeric values for XGBoost if has_minimal_dependencies: pytest.skip("Skipping because XGBoost not installed for minimal dependencies") if problem_type == ProblemTypes.REGRESSION: - pipeline = RegressionPipeline(component_graph=['Simple Imputer', 'XGBoost Regressor'], - parameters={'XGBoost Regressor': {'nthread': 1}}) + pipeline = RegressionPipeline( + component_graph=["Simple Imputer", "XGBoost Regressor"], + parameters={"XGBoost Regressor": {"nthread": 1}}, + ) X, y = X_y_regression elif problem_type == ProblemTypes.BINARY: - pipeline = BinaryClassificationPipeline(component_graph=['Simple Imputer', 'XGBoost Classifier'], - parameters={'XGBoost Classifier': {'nthread': 1}}) + pipeline = BinaryClassificationPipeline( + component_graph=["Simple Imputer", "XGBoost Classifier"], + parameters={"XGBoost Classifier": {"nthread": 1}}, + ) X, y = X_y_binary elif problem_type == ProblemTypes.MULTICLASS: - pipeline = MulticlassClassificationPipeline(component_graph=['Simple Imputer', 'XGBoost Classifier'], - parameters={'XGBoost Classifier': {'nthread': 1}}) + pipeline = MulticlassClassificationPipeline( + 
component_graph=["Simple Imputer", "XGBoost Classifier"], + parameters={"XGBoost Classifier": {"nthread": 1}}, + ) X, y = X_y_multi X = pd.DataFrame(X) - X = X.rename(columns={col_name: f'<[{col_name}]' for col_name in X.columns.values}) + X = X.rename(columns={col_name: f"<[{col_name}]" for col_name in X.columns.values}) col_names = X.columns.values pipeline.fit(X, y) assert len(pipeline.feature_importance) == len(X.columns) @@ -952,43 +1190,76 @@ def test_feature_importance_has_feature_names_xgboost(problem_type, has_minimal_ def test_component_not_found(): with pytest.raises(MissingComponentError, match="was not found"): - BinaryClassificationPipeline(component_graph=['Imputer', 'One Hot Encoder', 'This Component Does Not Exist', 'Standard Scaler', 'Logistic Regression Classifier']) + BinaryClassificationPipeline( + component_graph=[ + "Imputer", + "One Hot Encoder", + "This Component Does Not Exist", + "Standard Scaler", + "Logistic Regression Classifier", + ] + ) def test_get_default_parameters(logistic_regression_binary_pipeline_class): expected_defaults = { - 'Imputer': { - 'categorical_impute_strategy': 'most_frequent', - 'numeric_impute_strategy': 'mean', - 'categorical_fill_value': None, - 'numeric_fill_value': None + "Imputer": { + "categorical_impute_strategy": "most_frequent", + "numeric_impute_strategy": "mean", + "categorical_fill_value": None, + "numeric_fill_value": None, }, - 'One Hot Encoder': { - 'top_n': 10, - 'features_to_encode': None, - 'categories': None, - 'drop': 'if_binary', - 'handle_unknown': 'ignore', - 'handle_missing': 'error' + "One Hot Encoder": { + "top_n": 10, + "features_to_encode": None, + "categories": None, + "drop": "if_binary", + "handle_unknown": "ignore", + "handle_missing": "error", + }, + "Logistic Regression Classifier": { + "penalty": "l2", + "C": 1.0, + "n_jobs": -1, + "multi_class": "auto", + "solver": "lbfgs", }, - 'Logistic Regression Classifier': { - 'penalty': 'l2', - 'C': 1.0, - 'n_jobs': -1, - 'multi_class': 'auto', - 'solver': 'lbfgs' - } } - assert logistic_regression_binary_pipeline_class({}).default_parameters == expected_defaults + assert ( + logistic_regression_binary_pipeline_class({}).default_parameters + == expected_defaults + ) -@pytest.mark.parametrize("data_type", ['li', 'np', 'pd', 'ww']) +@pytest.mark.parametrize("data_type", ["li", "np", "pd", "ww"]) @pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]) -@pytest.mark.parametrize("target_type", ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool', 'category', 'object']) -def test_targets_data_types_classification_pipelines(data_type, problem_type, target_type, all_binary_pipeline_classes, - make_data_type, all_multiclass_pipeline_classes, helper_functions): - if data_type == 'np' and target_type in ['Int64', 'boolean']: - pytest.skip("Skipping test where data type is numpy and target type is nullable dtype") +@pytest.mark.parametrize( + "target_type", + [ + "int16", + "int32", + "int64", + "float16", + "float32", + "float64", + "bool", + "category", + "object", + ], +) +def test_targets_data_types_classification_pipelines( + data_type, + problem_type, + target_type, + all_binary_pipeline_classes, + make_data_type, + all_multiclass_pipeline_classes, + helper_functions, +): + if data_type == "np" and target_type in ["Int64", "boolean"]: + pytest.skip( + "Skipping test where data type is numpy and target type is nullable dtype" + ) if problem_type == ProblemTypes.BINARY: objective = "Log Loss Binary" @@ -998,7 +1269,9 @@ 
def test_targets_data_types_classification_pipelines(data_type, problem_type, ta y = y.map({"malignant": False, "benign": True}) elif problem_type == ProblemTypes.MULTICLASS: if "bool" in target_type: - pytest.skip("Skipping test where problem type is multiclass but target type is boolean") + pytest.skip( + "Skipping test where problem type is multiclass but target type is boolean" + ) objective = "Log Loss Multiclass" pipeline_classes = all_multiclass_pipeline_classes X, y = load_wine() @@ -1030,20 +1303,34 @@ def test_targets_data_types_classification_pipelines(data_type, problem_type, ta assert set(predict_proba.columns) == set(unique_vals) -@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION]) -def test_pipeline_not_fitted_error(problem_type, X_y_binary, X_y_multi, X_y_regression, - logistic_regression_binary_pipeline_class, - logistic_regression_multiclass_pipeline_class, - linear_regression_pipeline_class): +@pytest.mark.parametrize( + "problem_type", + [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION], +) +def test_pipeline_not_fitted_error( + problem_type, + X_y_binary, + X_y_multi, + X_y_regression, + logistic_regression_binary_pipeline_class, + logistic_regression_multiclass_pipeline_class, + linear_regression_pipeline_class, +): if problem_type == ProblemTypes.BINARY: X, y = X_y_binary - clf = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + clf = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) elif problem_type == ProblemTypes.MULTICLASS: X, y = X_y_multi - clf = logistic_regression_multiclass_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + clf = logistic_regression_multiclass_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) elif problem_type == ProblemTypes.REGRESSION: X, y = X_y_regression - clf = linear_regression_pipeline_class(parameters={"Linear Regressor": {"n_jobs": 1}}) + clf = linear_regression_pipeline_class( + parameters={"Linear Regressor": {"n_jobs": 1}} + ) with pytest.raises(PipelineNotYetFittedError): clf.predict(X) @@ -1057,26 +1344,26 @@ def test_pipeline_not_fitted_error(problem_type, X_y_binary, X_y_multi, X_y_regr clf.fit(X, y) if is_classification(problem_type): - to_patch = 'evalml.pipelines.ClassificationPipeline._predict' + to_patch = "evalml.pipelines.ClassificationPipeline._predict" if problem_type == ProblemTypes.BINARY: - to_patch = 'evalml.pipelines.BinaryClassificationPipeline._predict' + to_patch = "evalml.pipelines.BinaryClassificationPipeline._predict" with patch(to_patch) as mock_predict: clf.predict(X) mock_predict.assert_called() _, kwargs = mock_predict.call_args - assert kwargs['objective'] is None + assert kwargs["objective"] is None mock_predict.reset_mock() - clf.predict(X, 'Log Loss Binary') + clf.predict(X, "Log Loss Binary") mock_predict.assert_called() _, kwargs = mock_predict.call_args - assert kwargs['objective'] is not None + assert kwargs["objective"] is not None mock_predict.reset_mock() - clf.predict(X, objective='Log Loss Binary') + clf.predict(X, objective="Log Loss Binary") mock_predict.assert_called() _, kwargs = mock_predict.call_args - assert kwargs['objective'] is not None + assert kwargs["objective"] is not None clf.predict_proba(X) else: @@ -1084,21 +1371,36 @@ def test_pipeline_not_fitted_error(problem_type, X_y_binary, X_y_multi, X_y_regr 
clf.feature_importance -@patch('evalml.pipelines.PipelineBase.fit') -@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION]) -def test_nonlinear_pipeline_not_fitted_error(mock_fit, problem_type, X_y_binary, X_y_multi, X_y_regression, - nonlinear_binary_pipeline_class, - nonlinear_multiclass_pipeline_class, - nonlinear_regression_pipeline_class): +@patch("evalml.pipelines.PipelineBase.fit") +@pytest.mark.parametrize( + "problem_type", + [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION], +) +def test_nonlinear_pipeline_not_fitted_error( + mock_fit, + problem_type, + X_y_binary, + X_y_multi, + X_y_regression, + nonlinear_binary_pipeline_class, + nonlinear_multiclass_pipeline_class, + nonlinear_regression_pipeline_class, +): if problem_type == ProblemTypes.BINARY: X, y = X_y_binary - clf = nonlinear_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + clf = nonlinear_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) elif problem_type == ProblemTypes.MULTICLASS: X, y = X_y_multi - clf = nonlinear_multiclass_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + clf = nonlinear_multiclass_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) elif problem_type == ProblemTypes.REGRESSION: X, y = X_y_regression - clf = nonlinear_regression_pipeline_class(parameters={"Linear Regressor": {"n_jobs": 1}}) + clf = nonlinear_regression_pipeline_class( + parameters={"Linear Regressor": {"n_jobs": 1}} + ) with pytest.raises(PipelineNotYetFittedError): clf.predict(X) @@ -1111,80 +1413,130 @@ def test_nonlinear_pipeline_not_fitted_error(mock_fit, problem_type, X_y_binary, clf.fit(X, y) if problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]: - with patch('evalml.pipelines.ClassificationPipeline.predict') as mock_predict: + with patch("evalml.pipelines.ClassificationPipeline.predict") as mock_predict: clf.predict(X) mock_predict.assert_called() - with patch('evalml.pipelines.ClassificationPipeline.predict_proba') as mock_predict_proba: + with patch( + "evalml.pipelines.ClassificationPipeline.predict_proba" + ) as mock_predict_proba: clf.predict_proba(X) mock_predict_proba.assert_called() else: - with patch('evalml.pipelines.RegressionPipeline.predict') as mock_predict: + with patch("evalml.pipelines.RegressionPipeline.predict") as mock_predict: clf.predict(X) mock_predict.assert_called() clf.feature_importance -@pytest.mark.parametrize("pipeline_class", [BinaryClassificationPipeline, MulticlassClassificationPipeline, RegressionPipeline]) +@pytest.mark.parametrize( + "pipeline_class", + [ + BinaryClassificationPipeline, + MulticlassClassificationPipeline, + RegressionPipeline, + ], +) def test_pipeline_equality_different_attributes(pipeline_class): # Tests that two classes which are equivalent are not equal - if pipeline_class in [BinaryClassificationPipeline, MulticlassClassificationPipeline]: - final_estimator = 'Random Forest Classifier' + if pipeline_class in [ + BinaryClassificationPipeline, + MulticlassClassificationPipeline, + ]: + final_estimator = "Random Forest Classifier" else: - final_estimator = 'Random Forest Regressor' + final_estimator = "Random Forest Regressor" class MockPipeline(pipeline_class): custom_name = "Mock Pipeline" - component_graph = ['Imputer', final_estimator] + component_graph = ["Imputer", final_estimator] def __init__(self, parameters, random_seed=0): - 
super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) class MockPipelineWithADifferentClassName(pipeline_class): custom_name = "Mock Pipeline" - component_graph = ['Imputer', final_estimator] + component_graph = ["Imputer", final_estimator] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) - - assert MockPipeline(parameters={}) != MockPipelineWithADifferentClassName(parameters={}) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) + + assert MockPipeline(parameters={}) != MockPipelineWithADifferentClassName( + parameters={} + ) -@pytest.mark.parametrize("pipeline_class", [BinaryClassificationPipeline, MulticlassClassificationPipeline, RegressionPipeline]) +@pytest.mark.parametrize( + "pipeline_class", + [ + BinaryClassificationPipeline, + MulticlassClassificationPipeline, + RegressionPipeline, + ], +) def test_pipeline_equality_subclasses(pipeline_class): - if pipeline_class in [BinaryClassificationPipeline, MulticlassClassificationPipeline]: - final_estimator = 'Random Forest Classifier' + if pipeline_class in [ + BinaryClassificationPipeline, + MulticlassClassificationPipeline, + ]: + final_estimator = "Random Forest Classifier" else: - final_estimator = 'Random Forest Regressor' + final_estimator = "Random Forest Regressor" class MockPipeline(pipeline_class): custom_name = "Mock Pipeline" - component_graph = ['Imputer', final_estimator] + component_graph = ["Imputer", final_estimator] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) class MockPipelineSubclass(MockPipeline): pass + assert MockPipeline(parameters={}) != MockPipelineSubclass(parameters={}) -@pytest.mark.parametrize("pipeline_class", [BinaryClassificationPipeline, MulticlassClassificationPipeline, RegressionPipeline]) -@patch('evalml.pipelines.ComponentGraph.fit') +@pytest.mark.parametrize( + "pipeline_class", + [ + BinaryClassificationPipeline, + MulticlassClassificationPipeline, + RegressionPipeline, + ], +) +@patch("evalml.pipelines.ComponentGraph.fit") def test_pipeline_equality(mock_fit, pipeline_class): - if pipeline_class in [BinaryClassificationPipeline, MulticlassClassificationPipeline]: - final_estimator = 'Random Forest Classifier' + if pipeline_class in [ + BinaryClassificationPipeline, + MulticlassClassificationPipeline, + ]: + final_estimator = "Random Forest Classifier" else: - final_estimator = 'Random Forest Regressor' + final_estimator = "Random Forest Regressor" parameters = { - 'Imputer': { + "Imputer": { "categorical_impute_strategy": "most_frequent", "numeric_impute_strategy": "mean", } } different_parameters = { - 'Imputer': { + "Imputer": { "categorical_impute_strategy": "constant", "numeric_impute_strategy": "mean", } @@ -1192,10 +1544,15 @@ def test_pipeline_equality(mock_fit, pipeline_class): class MockPipeline(pipeline_class): custom_name = "Mock Pipeline" - component_graph = ['Imputer', final_estimator] + component_graph = ["Imputer", final_estimator] def 
__init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) # Test self-equality mock_pipeline = MockPipeline(parameters={}) @@ -1205,11 +1562,17 @@ def __init__(self, parameters, random_seed=0): assert MockPipeline(parameters={}) == MockPipeline(parameters={}) # Test random_seed - assert MockPipeline(parameters={}, random_seed=10) == MockPipeline(parameters={}, random_seed=10) - assert MockPipeline(parameters={}, random_seed=10) != MockPipeline(parameters={}, random_seed=0) + assert MockPipeline(parameters={}, random_seed=10) == MockPipeline( + parameters={}, random_seed=10 + ) + assert MockPipeline(parameters={}, random_seed=10) != MockPipeline( + parameters={}, random_seed=0 + ) # Test parameters - assert MockPipeline(parameters=parameters) != MockPipeline(parameters=different_parameters) + assert MockPipeline(parameters=parameters) != MockPipeline( + parameters=different_parameters + ) # Test fitted equality X = pd.DataFrame({}) @@ -1223,51 +1586,67 @@ def __init__(self, parameters, random_seed=0): # Test fitted equality: same data but different target names are not equal mock_pipeline_different_target_name = MockPipeline(parameters={}) - mock_pipeline_different_target_name.fit(X, y=pd.Series([], name="target with a name")) + mock_pipeline_different_target_name.fit( + X, y=pd.Series([], name="target with a name") + ) assert mock_pipeline != mock_pipeline_different_target_name -@pytest.mark.parametrize("pipeline_class", [BinaryClassificationPipeline, MulticlassClassificationPipeline, RegressionPipeline]) +@pytest.mark.parametrize( + "pipeline_class", + [ + BinaryClassificationPipeline, + MulticlassClassificationPipeline, + RegressionPipeline, + ], +) def test_nonlinear_pipeline_equality(pipeline_class): - if pipeline_class in [BinaryClassificationPipeline, MulticlassClassificationPipeline]: - final_estimator = 'Random Forest Classifier' + if pipeline_class in [ + BinaryClassificationPipeline, + MulticlassClassificationPipeline, + ]: + final_estimator = "Random Forest Classifier" else: - final_estimator = 'Random Forest Regressor' + final_estimator = "Random Forest Regressor" parameters = { - 'Imputer': { + "Imputer": { "categorical_impute_strategy": "most_frequent", "numeric_impute_strategy": "mean", }, - 'OHE_1': { - 'top_n': 5 - } + "OHE_1": {"top_n": 5}, } different_parameters = { - 'Imputer': { + "Imputer": { "categorical_impute_strategy": "constant", "numeric_impute_strategy": "mean", }, - 'OHE_2': { - 'top_n': 7, - } + "OHE_2": { + "top_n": 7, + }, } class MockPipeline(pipeline_class): custom_name = "Mock Pipeline" component_graph = { - 'Imputer': ['Imputer'], - 'OHE_1': ['One Hot Encoder', 'Imputer'], - 'OHE_2': ['One Hot Encoder', 'Imputer'], - 'Estimator': [final_estimator, 'OHE_1', 'OHE_2'] + "Imputer": ["Imputer"], + "OHE_1": ["One Hot Encoder", "Imputer"], + "OHE_2": ["One Hot Encoder", "Imputer"], + "Estimator": [final_estimator, "OHE_1", "OHE_2"], } def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) def fit(self, X, y=None): return self + # Test self-equality mock_pipeline = 
MockPipeline(parameters={}) assert mock_pipeline == mock_pipeline @@ -1276,11 +1655,17 @@ def fit(self, X, y=None): assert MockPipeline(parameters={}) == MockPipeline(parameters={}) # Test random_seed - assert MockPipeline(parameters={}, random_seed=10) == MockPipeline(parameters={}, random_seed=10) - assert MockPipeline(parameters={}, random_seed=10) != MockPipeline(parameters={}, random_seed=0) + assert MockPipeline(parameters={}, random_seed=10) == MockPipeline( + parameters={}, random_seed=10 + ) + assert MockPipeline(parameters={}, random_seed=10) != MockPipeline( + parameters={}, random_seed=0 + ) # Test parameters - assert MockPipeline(parameters=parameters) != MockPipeline(parameters=different_parameters) + assert MockPipeline(parameters=parameters) != MockPipeline( + parameters=different_parameters + ) # Test fitted equality X = pd.DataFrame({}) @@ -1292,20 +1677,34 @@ def fit(self, X, y=None): assert mock_pipeline == mock_pipeline_equal -@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION]) -def test_pipeline_equality_different_fitted_data(problem_type, X_y_binary, X_y_multi, X_y_regression, - linear_regression_pipeline_class, - logistic_regression_binary_pipeline_class, - logistic_regression_multiclass_pipeline_class): +@pytest.mark.parametrize( + "problem_type", + [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION], +) +def test_pipeline_equality_different_fitted_data( + problem_type, + X_y_binary, + X_y_multi, + X_y_regression, + linear_regression_pipeline_class, + logistic_regression_binary_pipeline_class, + logistic_regression_multiclass_pipeline_class, +): # Test fitted on different data if problem_type == ProblemTypes.BINARY: - pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) X, y = X_y_binary elif problem_type == ProblemTypes.MULTICLASS: - pipeline = logistic_regression_multiclass_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + pipeline = logistic_regression_multiclass_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) X, y = X_y_multi elif problem_type == ProblemTypes.REGRESSION: - pipeline = linear_regression_pipeline_class(parameters={"Linear Regressor": {"n_jobs": 1}}) + pipeline = linear_regression_pipeline_class( + parameters={"Linear Regressor": {"n_jobs": 1}} + ) X, y = X_y_regression pipeline_diff_data = pipeline.clone() @@ -1320,27 +1719,41 @@ def test_pipeline_equality_different_fitted_data(problem_type, X_y_binary, X_y_m def test_pipeline_str(): - class MockBinaryPipeline(BinaryClassificationPipeline): custom_name = "Mock Binary Pipeline" - component_graph = ['Imputer', 'Random Forest Classifier'] + component_graph = ["Imputer", "Random Forest Classifier"] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) class MockMulticlassPipeline(MulticlassClassificationPipeline): custom_name = "Mock Multiclass Pipeline" - component_graph = ['Imputer', 'Random Forest Classifier'] + component_graph = ["Imputer", "Random Forest Classifier"] def __init__(self, parameters, random_seed=0): - 
super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) class MockRegressionPipeline(RegressionPipeline): custom_name = "Mock Regression Pipeline" - component_graph = ['Imputer', 'Random Forest Regressor'] + component_graph = ["Imputer", "Random Forest Regressor"] def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters, custom_name=self.custom_name, random_seed=random_seed) + super().__init__( + self.component_graph, + parameters=parameters, + custom_name=self.custom_name, + random_seed=random_seed, + ) binary_pipeline = MockBinaryPipeline(parameters={}) multiclass_pipeline = MockMulticlassPipeline(parameters={}) @@ -1351,115 +1764,228 @@ def __init__(self, parameters, random_seed=0): assert str(regression_pipeline) == "Mock Regression Pipeline" -@pytest.mark.parametrize("pipeline_class", [BinaryClassificationPipeline, MulticlassClassificationPipeline, RegressionPipeline]) +@pytest.mark.parametrize( + "pipeline_class", + [ + BinaryClassificationPipeline, + MulticlassClassificationPipeline, + RegressionPipeline, + ], +) def test_pipeline_repr(pipeline_class): - if pipeline_class in [BinaryClassificationPipeline, MulticlassClassificationPipeline]: - final_estimator = 'Random Forest Classifier' + if pipeline_class in [ + BinaryClassificationPipeline, + MulticlassClassificationPipeline, + ]: + final_estimator = "Random Forest Classifier" else: - final_estimator = 'Random Forest Regressor' + final_estimator = "Random Forest Regressor" custom_name = "Mock Pipeline" - component_graph = ['Imputer', final_estimator] + component_graph = ["Imputer", final_estimator] pipeline = pipeline_class(component_graph=component_graph, custom_name=custom_name) - expected_repr = f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', '{final_estimator}'], " \ - f"parameters={{'Imputer':{{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}}, '{final_estimator}':{{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}}}, " \ + expected_repr = ( + f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', '{final_estimator}'], " + f"parameters={{'Imputer':{{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}}, '{final_estimator}':{{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}}}, " "custom_name='Mock Pipeline', random_seed=0)" + ) assert repr(pipeline) == expected_repr - pipeline_with_parameters = pipeline_class(component_graph=component_graph, parameters={'Imputer': {'numeric_fill_value': 42}}, custom_name=custom_name) - expected_repr = f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', '{final_estimator}'], " \ - f"parameters={{'Imputer':{{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': 42}}, '{final_estimator}':{{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}}}, " \ + pipeline_with_parameters = pipeline_class( + component_graph=component_graph, + parameters={"Imputer": {"numeric_fill_value": 42}}, + custom_name=custom_name, + ) + expected_repr = ( + f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', '{final_estimator}'], " + 
f"parameters={{'Imputer':{{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': 42}}, '{final_estimator}':{{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}}}, " "custom_name='Mock Pipeline', random_seed=0)" + ) assert repr(pipeline_with_parameters) == expected_repr - pipeline_with_inf_parameters = pipeline_class(component_graph=component_graph, parameters={'Imputer': {'numeric_fill_value': float('inf'), 'categorical_fill_value': np.inf}}) - expected_repr = f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', '{final_estimator}'], " \ + pipeline_with_inf_parameters = pipeline_class( + component_graph=component_graph, + parameters={ + "Imputer": { + "numeric_fill_value": float("inf"), + "categorical_fill_value": np.inf, + } + }, + ) + expected_repr = ( + f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', '{final_estimator}'], " f"parameters={{'Imputer':{{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': float('inf'), 'numeric_fill_value': float('inf')}}, '{final_estimator}':{{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}}}, random_seed=0)" + ) assert repr(pipeline_with_inf_parameters) == expected_repr - pipeline_with_nan_parameters = pipeline_class(component_graph=component_graph, parameters={'Imputer': {'numeric_fill_value': float('nan'), 'categorical_fill_value': np.nan}}) - expected_repr = f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', '{final_estimator}'], " \ + pipeline_with_nan_parameters = pipeline_class( + component_graph=component_graph, + parameters={ + "Imputer": { + "numeric_fill_value": float("nan"), + "categorical_fill_value": np.nan, + } + }, + ) + expected_repr = ( + f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', '{final_estimator}'], " f"parameters={{'Imputer':{{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': np.nan, 'numeric_fill_value': np.nan}}, '{final_estimator}':{{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}}}, random_seed=0)" + ) assert repr(pipeline_with_nan_parameters) == expected_repr -@pytest.mark.parametrize("pipeline_class", [BinaryClassificationPipeline, MulticlassClassificationPipeline, RegressionPipeline]) +@pytest.mark.parametrize( + "pipeline_class", + [ + BinaryClassificationPipeline, + MulticlassClassificationPipeline, + RegressionPipeline, + ], +) def test_nonlinear_pipeline_repr(pipeline_class): - if pipeline_class in [BinaryClassificationPipeline, MulticlassClassificationPipeline]: - final_estimator = 'Random Forest Classifier' + if pipeline_class in [ + BinaryClassificationPipeline, + MulticlassClassificationPipeline, + ]: + final_estimator = "Random Forest Classifier" else: - final_estimator = 'Random Forest Regressor' + final_estimator = "Random Forest Regressor" custom_name = "Mock Pipeline" component_graph = { - 'Imputer': ['Imputer'], - 'OHE_1': ['One Hot Encoder', 'Imputer'], - 'OHE_2': ['One Hot Encoder', 'Imputer'], - 'Estimator': [final_estimator, 'OHE_1', 'OHE_2'] + "Imputer": ["Imputer"], + "OHE_1": ["One Hot Encoder", "Imputer"], + "OHE_2": ["One Hot Encoder", "Imputer"], + "Estimator": [final_estimator, "OHE_1", "OHE_2"], } pipeline = pipeline_class(component_graph=component_graph, custom_name=custom_name) - expected_repr = f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', 'OHE_1', 'OHE_2', 'Estimator'], " \ - 
"parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, " \ - "'OHE_1':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, " \ - "'OHE_2':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, " \ + expected_repr = ( + f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', 'OHE_1', 'OHE_2', 'Estimator'], " + "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, " + "'OHE_1':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, " + "'OHE_2':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, " "'Estimator':{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}, custom_name='Mock Pipeline', random_seed=0)" + ) assert repr(pipeline) == expected_repr - pipeline_with_parameters = pipeline_class(component_graph=component_graph, custom_name=custom_name, parameters={'Imputer': {'numeric_fill_value': 42}}) - expected_repr = f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', 'OHE_1', 'OHE_2', 'Estimator'], " \ - "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': 42}, " \ - "'OHE_1':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, " \ - "'OHE_2':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, " \ + pipeline_with_parameters = pipeline_class( + component_graph=component_graph, + custom_name=custom_name, + parameters={"Imputer": {"numeric_fill_value": 42}}, + ) + expected_repr = ( + f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', 'OHE_1', 'OHE_2', 'Estimator'], " + "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': 42}, " + "'OHE_1':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, " + "'OHE_2':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, " "'Estimator':{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}, custom_name='Mock Pipeline', random_seed=0)" + ) assert repr(pipeline_with_parameters) == expected_repr - pipeline_with_inf_parameters = pipeline_class(component_graph=component_graph, custom_name=custom_name, parameters={'Imputer': {'numeric_fill_value': float('inf'), 'categorical_fill_value': np.inf}}) - expected_repr = f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', 'OHE_1', 'OHE_2', 'Estimator'], " \ - "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': float('inf'), 'numeric_fill_value': float('inf')}, " \ - "'OHE_1':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 
'ignore', 'handle_missing': 'error'}, " \ - "'OHE_2':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, " \ + pipeline_with_inf_parameters = pipeline_class( + component_graph=component_graph, + custom_name=custom_name, + parameters={ + "Imputer": { + "numeric_fill_value": float("inf"), + "categorical_fill_value": np.inf, + } + }, + ) + expected_repr = ( + f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', 'OHE_1', 'OHE_2', 'Estimator'], " + "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': float('inf'), 'numeric_fill_value': float('inf')}, " + "'OHE_1':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, " + "'OHE_2':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, " "'Estimator':{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}, custom_name='Mock Pipeline', random_seed=0)" + ) assert repr(pipeline_with_inf_parameters) == expected_repr - pipeline_with_nan_parameters = pipeline_class(component_graph=component_graph, custom_name=custom_name, parameters={'Imputer': {'numeric_fill_value': float('nan'), 'categorical_fill_value': np.nan}}) - expected_repr = f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', 'OHE_1', 'OHE_2', 'Estimator'], " \ - "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': np.nan, 'numeric_fill_value': np.nan}, " \ - "'OHE_1':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, " \ - "'OHE_2':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, " \ + pipeline_with_nan_parameters = pipeline_class( + component_graph=component_graph, + custom_name=custom_name, + parameters={ + "Imputer": { + "numeric_fill_value": float("nan"), + "categorical_fill_value": np.nan, + } + }, + ) + expected_repr = ( + f"pipeline = {pipeline_class.__name__}(component_graph=['Imputer', 'OHE_1', 'OHE_2', 'Estimator'], " + "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': np.nan, 'numeric_fill_value': np.nan}, " + "'OHE_1':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, " + "'OHE_2':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, " "'Estimator':{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}, custom_name='Mock Pipeline', random_seed=0)" + ) assert repr(pipeline_with_nan_parameters) == expected_repr -@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION, - ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS]) -def test_predict_has_input_target_name(problem_type, X_y_binary, X_y_multi, X_y_regression, ts_data, - logistic_regression_binary_pipeline_class, logistic_regression_multiclass_pipeline_class, linear_regression_pipeline_class, time_series_regression_pipeline_class, 
time_series_binary_classification_pipeline_class, - time_series_multiclass_classification_pipeline_class): +@pytest.mark.parametrize( + "problem_type", + [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ], +) +def test_predict_has_input_target_name( + problem_type, + X_y_binary, + X_y_multi, + X_y_regression, + ts_data, + logistic_regression_binary_pipeline_class, + logistic_regression_multiclass_pipeline_class, + linear_regression_pipeline_class, + time_series_regression_pipeline_class, + time_series_binary_classification_pipeline_class, + time_series_multiclass_classification_pipeline_class, +): if problem_type == ProblemTypes.BINARY: X, y = X_y_binary - clf = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + clf = logistic_regression_binary_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) elif problem_type == ProblemTypes.MULTICLASS: X, y = X_y_multi - clf = logistic_regression_multiclass_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}) + clf = logistic_regression_multiclass_pipeline_class( + parameters={"Logistic Regression Classifier": {"n_jobs": 1}} + ) elif problem_type == ProblemTypes.REGRESSION: X, y = X_y_regression - clf = linear_regression_pipeline_class(parameters={"Linear Regressor": {"n_jobs": 1}}) + clf = linear_regression_pipeline_class( + parameters={"Linear Regressor": {"n_jobs": 1}} + ) elif problem_type == ProblemTypes.TIME_SERIES_REGRESSION: X, y = ts_data - clf = time_series_regression_pipeline_class(parameters={"pipeline": {"gap": 0, "max_delay": 0, "date_index": None}}) + clf = time_series_regression_pipeline_class( + parameters={"pipeline": {"gap": 0, "max_delay": 0, "date_index": None}} + ) elif problem_type == ProblemTypes.TIME_SERIES_BINARY: X, y = X_y_binary - clf = time_series_binary_classification_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}, - "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}}) + clf = time_series_binary_classification_pipeline_class( + parameters={ + "Logistic Regression Classifier": {"n_jobs": 1}, + "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}, + } + ) elif problem_type == ProblemTypes.TIME_SERIES_MULTICLASS: X, y = X_y_multi - clf = time_series_multiclass_classification_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}, - "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}}) + clf = time_series_multiclass_classification_pipeline_class( + parameters={ + "Logistic Regression Classifier": {"n_jobs": 1}, + "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}, + } + ) y = pd.Series(y, name="test target name") clf.fit(X, y) if is_time_series(problem_type): @@ -1470,7 +1996,12 @@ def test_predict_has_input_target_name(problem_type, X_y_binary, X_y_multi, X_y_ def test_linear_pipeline_iteration(logistic_regression_binary_pipeline_class): - expected_order = [Imputer(), OneHotEncoder(), StandardScaler(), LogisticRegressionClassifier()] + expected_order = [ + Imputer(), + OneHotEncoder(), + StandardScaler(), + LogisticRegressionClassifier(), + ] pipeline = logistic_regression_binary_pipeline_class({}) order = [c for c in pipeline] @@ -1479,9 +2010,19 @@ def test_linear_pipeline_iteration(logistic_regression_binary_pipeline_class): assert order == expected_order assert order_again == expected_order - 
expected_order_params = [Imputer(numeric_impute_strategy='median'), OneHotEncoder(top_n=2), StandardScaler(), LogisticRegressionClassifier()] - - pipeline = logistic_regression_binary_pipeline_class({'One Hot Encoder': {'top_n': 2}, 'Imputer': {'numeric_impute_strategy': 'median'}}) + expected_order_params = [ + Imputer(numeric_impute_strategy="median"), + OneHotEncoder(top_n=2), + StandardScaler(), + LogisticRegressionClassifier(), + ] + + pipeline = logistic_regression_binary_pipeline_class( + { + "One Hot Encoder": {"top_n": 2}, + "Imputer": {"numeric_impute_strategy": "median"}, + } + ) order_params = [c for c in pipeline] order_again_params = [c for c in pipeline] @@ -1490,7 +2031,14 @@ def test_linear_pipeline_iteration(logistic_regression_binary_pipeline_class): def test_nonlinear_pipeline_iteration(nonlinear_binary_pipeline_class): - expected_order = [Imputer(), OneHotEncoder(), ElasticNetClassifier(), OneHotEncoder(), RandomForestClassifier(), LogisticRegressionClassifier()] + expected_order = [ + Imputer(), + OneHotEncoder(), + ElasticNetClassifier(), + OneHotEncoder(), + RandomForestClassifier(), + LogisticRegressionClassifier(), + ] pipeline = nonlinear_binary_pipeline_class({}) order = [c for c in pipeline] @@ -1499,9 +2047,18 @@ def test_nonlinear_pipeline_iteration(nonlinear_binary_pipeline_class): assert order == expected_order assert order_again == expected_order - expected_order_params = [Imputer(), OneHotEncoder(top_n=2), ElasticNetClassifier(), OneHotEncoder(top_n=5), RandomForestClassifier(), LogisticRegressionClassifier()] - - pipeline = nonlinear_binary_pipeline_class({'OneHot_ElasticNet': {'top_n': 2}, 'OneHot_RandomForest': {'top_n': 5}}) + expected_order_params = [ + Imputer(), + OneHotEncoder(top_n=2), + ElasticNetClassifier(), + OneHotEncoder(top_n=5), + RandomForestClassifier(), + LogisticRegressionClassifier(), + ] + + pipeline = nonlinear_binary_pipeline_class( + {"OneHot_ElasticNet": {"top_n": 2}, "OneHot_RandomForest": {"top_n": 5}} + ) order_params = [c for c in pipeline] order_again_params = [c for c in pipeline] @@ -1510,21 +2067,23 @@ def test_nonlinear_pipeline_iteration(nonlinear_binary_pipeline_class): def test_linear_getitem(logistic_regression_binary_pipeline_class): - pipeline = logistic_regression_binary_pipeline_class({'One Hot Encoder': {'top_n': 4}}) + pipeline = logistic_regression_binary_pipeline_class( + {"One Hot Encoder": {"top_n": 4}} + ) assert pipeline[0] == Imputer() assert pipeline[1] == OneHotEncoder(top_n=4) assert pipeline[2] == StandardScaler() assert pipeline[3] == LogisticRegressionClassifier() - assert pipeline['Imputer'] == Imputer() - assert pipeline['One Hot Encoder'] == OneHotEncoder(top_n=4) - assert pipeline['Standard Scaler'] == StandardScaler() - assert pipeline['Logistic Regression Classifier'] == LogisticRegressionClassifier() + assert pipeline["Imputer"] == Imputer() + assert pipeline["One Hot Encoder"] == OneHotEncoder(top_n=4) + assert pipeline["Standard Scaler"] == StandardScaler() + assert pipeline["Logistic Regression Classifier"] == LogisticRegressionClassifier() def test_nonlinear_getitem(nonlinear_binary_pipeline_class): - pipeline = nonlinear_binary_pipeline_class({'OneHot_RandomForest': {'top_n': 4}}) + pipeline = nonlinear_binary_pipeline_class({"OneHot_RandomForest": {"top_n": 4}}) assert pipeline[0] == Imputer() assert pipeline[1] == OneHotEncoder() @@ -1533,36 +2092,49 @@ def test_nonlinear_getitem(nonlinear_binary_pipeline_class): assert pipeline[4] == RandomForestClassifier() assert pipeline[5] == 
LogisticRegressionClassifier() - assert pipeline['Imputer'] == Imputer() - assert pipeline['OneHot_ElasticNet'] == OneHotEncoder() - assert pipeline['Elastic Net'] == ElasticNetClassifier() - assert pipeline['OneHot_RandomForest'] == OneHotEncoder(top_n=4) - assert pipeline['Random Forest'] == RandomForestClassifier() - assert pipeline['Logistic Regression'] == LogisticRegressionClassifier() + assert pipeline["Imputer"] == Imputer() + assert pipeline["OneHot_ElasticNet"] == OneHotEncoder() + assert pipeline["Elastic Net"] == ElasticNetClassifier() + assert pipeline["OneHot_RandomForest"] == OneHotEncoder(top_n=4) + assert pipeline["Random Forest"] == RandomForestClassifier() + assert pipeline["Logistic Regression"] == LogisticRegressionClassifier() -def test_get_component(logistic_regression_binary_pipeline_class, nonlinear_binary_pipeline_class): - pipeline = logistic_regression_binary_pipeline_class({'One Hot Encoder': {'top_n': 4}}) +def test_get_component( + logistic_regression_binary_pipeline_class, nonlinear_binary_pipeline_class +): + pipeline = logistic_regression_binary_pipeline_class( + {"One Hot Encoder": {"top_n": 4}} + ) - assert pipeline.get_component('Imputer') == Imputer() - assert pipeline.get_component('One Hot Encoder') == OneHotEncoder(top_n=4) - assert pipeline.get_component('Standard Scaler') == StandardScaler() - assert pipeline.get_component('Logistic Regression Classifier') == LogisticRegressionClassifier() + assert pipeline.get_component("Imputer") == Imputer() + assert pipeline.get_component("One Hot Encoder") == OneHotEncoder(top_n=4) + assert pipeline.get_component("Standard Scaler") == StandardScaler() + assert ( + pipeline.get_component("Logistic Regression Classifier") + == LogisticRegressionClassifier() + ) - pipeline = nonlinear_binary_pipeline_class({'OneHot_RandomForest': {'top_n': 4}}) + pipeline = nonlinear_binary_pipeline_class({"OneHot_RandomForest": {"top_n": 4}}) - assert pipeline.get_component('Imputer') == Imputer() - assert pipeline.get_component('OneHot_ElasticNet') == OneHotEncoder() - assert pipeline.get_component('Elastic Net') == ElasticNetClassifier() - assert pipeline.get_component('OneHot_RandomForest') == OneHotEncoder(top_n=4) - assert pipeline.get_component('Random Forest') == RandomForestClassifier() - assert pipeline.get_component('Logistic Regression') == LogisticRegressionClassifier() + assert pipeline.get_component("Imputer") == Imputer() + assert pipeline.get_component("OneHot_ElasticNet") == OneHotEncoder() + assert pipeline.get_component("Elastic Net") == ElasticNetClassifier() + assert pipeline.get_component("OneHot_RandomForest") == OneHotEncoder(top_n=4) + assert pipeline.get_component("Random Forest") == RandomForestClassifier() + assert ( + pipeline.get_component("Logistic Regression") == LogisticRegressionClassifier() + ) @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) -def test_score_error_when_custom_objective_not_instantiated(problem_type, logistic_regression_binary_pipeline_class, - dummy_multiclass_pipeline_class, - dummy_regression_pipeline_class, X_y_binary): +def test_score_error_when_custom_objective_not_instantiated( + problem_type, + logistic_regression_binary_pipeline_class, + dummy_multiclass_pipeline_class, + dummy_regression_pipeline_class, + X_y_binary, +): pipeline = dummy_regression_pipeline_class({}) if is_binary(problem_type): pipeline = logistic_regression_binary_pipeline_class({}) @@ -1576,7 +2148,9 @@ def test_score_error_when_custom_objective_not_instantiated(problem_type, 
logist pipeline.score(X, y, objectives=["cost benefit matrix", "F1"]) # Verify ObjectiveCreationError only raised when string matches an existing objective - with pytest.raises(ObjectiveNotFoundError, match="cost benefit is not a valid Objective!"): + with pytest.raises( + ObjectiveNotFoundError, match="cost benefit is not a valid Objective!" + ): pipeline.score(X, y, objectives=["cost benefit", "F1"]) # Verify no exception when objective properly specified @@ -1585,17 +2159,29 @@ def test_score_error_when_custom_objective_not_instantiated(problem_type, logist @pytest.mark.parametrize("is_time_series", [True, False]) -def test_binary_pipeline_string_target_thresholding(is_time_series, make_data_type, time_series_binary_classification_pipeline_class, - logistic_regression_binary_pipeline_class, - X_y_binary): +def test_binary_pipeline_string_target_thresholding( + is_time_series, + make_data_type, + time_series_binary_classification_pipeline_class, + logistic_regression_binary_pipeline_class, + X_y_binary, +): X, y = X_y_binary - X = make_data_type('ww', X) + X = make_data_type("ww", X) y = ww.init_series(pd.Series([f"String value {i}" for i in y]), "Categorical") objective = get_objective("F1", return_instance=True) - pipeline_class = time_series_binary_classification_pipeline_class if is_time_series else logistic_regression_binary_pipeline_class + pipeline_class = ( + time_series_binary_classification_pipeline_class + if is_time_series + else logistic_regression_binary_pipeline_class + ) - pipeline = pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}, - "pipeline": {"gap": 0, "max_delay": 1, "date_index": None}}) + pipeline = pipeline_class( + parameters={ + "Logistic Regression Classifier": {"n_jobs": 1}, + "pipeline": {"gap": 0, "max_delay": 1, "date_index": None}, + } + ) pipeline.fit(X, y) assert pipeline.threshold is None pred_proba = pipeline.predict_proba(X, y).iloc[:, 1] @@ -1605,10 +2191,11 @@ def test_binary_pipeline_string_target_thresholding(is_time_series, make_data_ty @patch("evalml.pipelines.components.LogisticRegressionClassifier.fit") def test_undersampler_component_in_pipeline_fit(mock_fit): - X = pd.DataFrame({"a": [i for i in range(1000)], - "b": [i % 3 for i in range(1000)]}) + X = pd.DataFrame({"a": [i for i in range(1000)], "b": [i % 3 for i in range(1000)]}) y = pd.Series([0] * 100 + [1] * 900) - pipeline = BinaryClassificationPipeline(['Imputer', 'Undersampler', 'Logistic Regression Classifier']) + pipeline = BinaryClassificationPipeline( + ["Imputer", "Undersampler", "Logistic Regression Classifier"] + ) pipeline.fit(X, y) # make sure we undersample to 500 values in the X and y assert len(mock_fit.call_args[0][0]) == 500 @@ -1621,10 +2208,11 @@ def test_undersampler_component_in_pipeline_fit(mock_fit): def test_undersampler_component_in_pipeline_predict(): - X = pd.DataFrame({"a": [i for i in range(1000)], - "b": [i % 3 for i in range(1000)]}) + X = pd.DataFrame({"a": [i for i in range(1000)], "b": [i % 3 for i in range(1000)]}) y = pd.Series([0] * 100 + [1] * 900) - pipeline = BinaryClassificationPipeline(['Imputer', 'Undersampler', 'Logistic Regression Classifier']) + pipeline = BinaryClassificationPipeline( + ["Imputer", "Undersampler", "Logistic Regression Classifier"] + ) pipeline.fit(X, y) preds = pipeline.predict(X) assert len(preds) == 1000 @@ -1632,17 +2220,28 @@ def test_undersampler_component_in_pipeline_predict(): assert len(preds) == 1000 -@pytest.mark.parametrize('oversampler', ['SMOTE Oversampler', 'SMOTENC Oversampler', 
'SMOTEN Oversampler']) +@pytest.mark.parametrize( + "oversampler", ["SMOTE Oversampler", "SMOTENC Oversampler", "SMOTEN Oversampler"] +) @patch("evalml.pipelines.components.LogisticRegressionClassifier.fit") def test_oversampler_component_in_pipeline_fit(mock_fit, oversampler): - pytest.importorskip('imblearn.over_sampling', reason='Skipping test because imbalanced-learn not installed') + pytest.importorskip( + "imblearn.over_sampling", + reason="Skipping test because imbalanced-learn not installed", + ) - X = pd.DataFrame({"a": [i for i in range(1000)], - "b": [i % 3 for i in range(1000)], - "c": [i % 7 for i in range(1, 1001)]}) + X = pd.DataFrame( + { + "a": [i for i in range(1000)], + "b": [i % 3 for i in range(1000)], + "c": [i % 7 for i in range(1, 1001)], + } + ) X.ww.init(logical_types={"c": "Categorical"}) y = pd.Series([0] * 100 + [1] * 900) - pipeline = BinaryClassificationPipeline(['Imputer', oversampler, 'Logistic Regression Classifier']) + pipeline = BinaryClassificationPipeline( + ["Imputer", oversampler, "Logistic Regression Classifier"] + ) pipeline.fit(X, y) # make sure we oversample 0 to 225 values values in the X and y assert len(mock_fit.call_args[0][0]) == 1125 @@ -1654,15 +2253,26 @@ def test_oversampler_component_in_pipeline_fit(mock_fit, oversampler): assert len(mock_fit.call_args[0][0]) == 1000 -@pytest.mark.parametrize('oversampler', ['SMOTE Oversampler', 'SMOTENC Oversampler', 'SMOTEN Oversampler']) +@pytest.mark.parametrize( + "oversampler", ["SMOTE Oversampler", "SMOTENC Oversampler", "SMOTEN Oversampler"] +) def test_oversampler_component_in_pipeline_predict(oversampler): - pytest.importorskip('imblearn.over_sampling', reason='Skipping test because imbalanced-learn not installed') - X = pd.DataFrame({"a": [i for i in range(1000)], - "b": [i % 3 for i in range(1000)], - "c": [i % 7 for i in range(1, 1001)]}) + pytest.importorskip( + "imblearn.over_sampling", + reason="Skipping test because imbalanced-learn not installed", + ) + X = pd.DataFrame( + { + "a": [i for i in range(1000)], + "b": [i % 3 for i in range(1000)], + "c": [i % 7 for i in range(1, 1001)], + } + ) X.ww.init(logical_types={"c": "Categorical"}) y = pd.Series([0] * 100 + [1] * 900) - pipeline = BinaryClassificationPipeline(['Imputer', oversampler, 'Logistic Regression Classifier']) + pipeline = BinaryClassificationPipeline( + ["Imputer", oversampler, "Logistic Regression Classifier"] + ) pipeline.fit(X, y) preds = pipeline.predict(X) assert len(preds) == 1000 diff --git a/evalml/tests/pipeline_tests/test_time_series_pipeline.py b/evalml/tests/pipeline_tests/test_time_series_pipeline.py index 34f1b34549..a34b62b701 100644 --- a/evalml/tests/pipeline_tests/test_time_series_pipeline.py +++ b/evalml/tests/pipeline_tests/test_time_series_pipeline.py @@ -11,49 +11,102 @@ from evalml.pipelines import ( TimeSeriesBinaryClassificationPipeline, TimeSeriesMulticlassClassificationPipeline, - TimeSeriesRegressionPipeline + TimeSeriesRegressionPipeline, ) from evalml.preprocessing.utils import is_classification from evalml.problem_types import ProblemTypes -@pytest.mark.parametrize("pipeline_class,estimator", [(TimeSeriesRegressionPipeline, "Linear Regressor"), - (TimeSeriesBinaryClassificationPipeline, "Logistic Regression Classifier"), - (TimeSeriesMulticlassClassificationPipeline, "Logistic Regression Classifier")]) -@pytest.mark.parametrize("components", [["One Hot Encoder"], - ["Delayed Feature Transformer", "One Hot Encoder"]]) +@pytest.mark.parametrize( + "pipeline_class,estimator", + [ + 
(TimeSeriesRegressionPipeline, "Linear Regressor"), + (TimeSeriesBinaryClassificationPipeline, "Logistic Regression Classifier"), + (TimeSeriesMulticlassClassificationPipeline, "Logistic Regression Classifier"), + ], +) +@pytest.mark.parametrize( + "components", + [["One Hot Encoder"], ["Delayed Feature Transformer", "One Hot Encoder"]], +) def test_time_series_pipeline_init(pipeline_class, estimator, components): component_graph = components + [estimator] if "Delayed Feature Transformer" not in components: - pl = pipeline_class(component_graph=component_graph, - parameters={'pipeline': {"date_index": None, "gap": 3, "max_delay": 5}}) + pl = pipeline_class( + component_graph=component_graph, + parameters={"pipeline": {"date_index": None, "gap": 3, "max_delay": 5}}, + ) assert "Delayed Feature Transformer" not in pl.parameters - assert pl.parameters['pipeline'] == {"gap": 3, "max_delay": 5, "date_index": None} + assert pl.parameters["pipeline"] == { + "gap": 3, + "max_delay": 5, + "date_index": None, + } else: - parameters = {"Delayed Feature Transformer": {"date_index": None, "gap": 3, "max_delay": 5}, - "pipeline": {"date_index": None, "gap": 3, "max_delay": 5}} + parameters = { + "Delayed Feature Transformer": { + "date_index": None, + "gap": 3, + "max_delay": 5, + }, + "pipeline": {"date_index": None, "gap": 3, "max_delay": 5}, + } pl = pipeline_class(component_graph=component_graph, parameters=parameters) - assert pl.parameters['Delayed Feature Transformer'] == {"date_index": None, "gap": 3, "max_delay": 5, - "delay_features": True, "delay_target": True} - assert pl.parameters['pipeline'] == {"gap": 3, "max_delay": 5, "date_index": None} - - assert pipeline_class(component_graph=component_graph, parameters=pl.parameters) == pl - - with pytest.raises(ValueError, match="date_index, gap, and max_delay parameters cannot be omitted from the parameters dict"): + assert pl.parameters["Delayed Feature Transformer"] == { + "date_index": None, + "gap": 3, + "max_delay": 5, + "delay_features": True, + "delay_target": True, + } + assert pl.parameters["pipeline"] == { + "gap": 3, + "max_delay": 5, + "date_index": None, + } + + assert ( + pipeline_class(component_graph=component_graph, parameters=pl.parameters) == pl + ) + + with pytest.raises( + ValueError, + match="date_index, gap, and max_delay parameters cannot be omitted from the parameters dict", + ): pipeline_class(component_graph, {}) @pytest.mark.parametrize("only_use_y", [True, False]) @pytest.mark.parametrize("include_delayed_features", [True, False]) -@pytest.mark.parametrize("gap,max_delay", [(0, 0), (1, 0), (0, 2), (1, 2), (2, 2), (7, 3), (2, 4)]) -@pytest.mark.parametrize("pipeline_class,estimator_name", [(TimeSeriesRegressionPipeline, "Random Forest Regressor"), - (TimeSeriesBinaryClassificationPipeline, "Random Forest Classifier"), - (TimeSeriesMulticlassClassificationPipeline, "Random Forest Classifier")]) +@pytest.mark.parametrize( + "gap,max_delay", [(0, 0), (1, 0), (0, 2), (1, 2), (2, 2), (7, 3), (2, 4)] +) +@pytest.mark.parametrize( + "pipeline_class,estimator_name", + [ + (TimeSeriesRegressionPipeline, "Random Forest Regressor"), + (TimeSeriesBinaryClassificationPipeline, "Random Forest Classifier"), + (TimeSeriesMulticlassClassificationPipeline, "Random Forest Classifier"), + ], +) @patch("evalml.pipelines.components.RandomForestRegressor.fit") @patch("evalml.pipelines.components.RandomForestClassifier.fit") -@patch("evalml.pipelines.TimeSeriesClassificationPipeline._encode_targets", side_effect=lambda y: y) -def 
test_fit_drop_nans_before_estimator(mock_encode_targets, mock_classifier_fit, mock_regressor_fit, pipeline_class, - estimator_name, gap, max_delay, include_delayed_features, only_use_y, ts_data): +@patch( + "evalml.pipelines.TimeSeriesClassificationPipeline._encode_targets", + side_effect=lambda y: y, +) +def test_fit_drop_nans_before_estimator( + mock_encode_targets, + mock_classifier_fit, + mock_regressor_fit, + pipeline_class, + estimator_name, + gap, + max_delay, + include_delayed_features, + only_use_y, + ts_data, +): if only_use_y and (not include_delayed_features or (max_delay == 0 and gap == 0)): pytest.skip("This would result in an empty feature dataframe.") @@ -67,11 +120,19 @@ def test_fit_drop_nans_before_estimator(mock_encode_targets, mock_classifier_fit train_index = pd.date_range(f"2020-10-01", f"2020-10-{31-gap}") expected_target = np.arange(1 + gap, 32) - pl = pipeline_class(component_graph=["Delayed Feature Transformer", estimator_name], - parameters={"Delayed Feature Transformer": {"date_index": None, "gap": gap, "max_delay": max_delay, - "delay_features": include_delayed_features, - "delay_target": include_delayed_features}, - "pipeline": {"date_index": None, "gap": gap, "max_delay": max_delay}}) + pl = pipeline_class( + component_graph=["Delayed Feature Transformer", estimator_name], + parameters={ + "Delayed Feature Transformer": { + "date_index": None, + "gap": gap, + "max_delay": max_delay, + "delay_features": include_delayed_features, + "delay_target": include_delayed_features, + }, + "pipeline": {"date_index": None, "gap": gap, "max_delay": max_delay}, + }, + ) if only_use_y: pl.fit(None, y) @@ -79,9 +140,15 @@ def test_fit_drop_nans_before_estimator(mock_encode_targets, mock_classifier_fit pl.fit(X, y) if isinstance(pl, TimeSeriesRegressionPipeline): - df_passed_to_estimator, target_passed_to_estimator = mock_regressor_fit.call_args[0] + ( + df_passed_to_estimator, + target_passed_to_estimator, + ) = mock_regressor_fit.call_args[0] else: - df_passed_to_estimator, target_passed_to_estimator = mock_classifier_fit.call_args[0] + ( + df_passed_to_estimator, + target_passed_to_estimator, + ) = mock_classifier_fit.call_args[0] # NaNs introduced by shifting are dropped assert not df_passed_to_estimator.isna().any(axis=1).any() @@ -94,31 +161,69 @@ def test_fit_drop_nans_before_estimator(mock_encode_targets, mock_classifier_fit @pytest.mark.parametrize("only_use_y", [True, False]) @pytest.mark.parametrize("include_delayed_features", [True, False]) -@pytest.mark.parametrize("gap,max_delay,date_index", [(0, 0, None), (1, 0, None), (0, 2, None), (1, 1, None), - (1, 2, None), (2, 2, None), (7, 3, None), (2, 4, None)]) -@pytest.mark.parametrize("pipeline_class,estimator_name", [(TimeSeriesRegressionPipeline, "Random Forest Regressor"), - (TimeSeriesBinaryClassificationPipeline, "Random Forest Classifier"), - (TimeSeriesMulticlassClassificationPipeline, "Random Forest Classifier")]) +@pytest.mark.parametrize( + "gap,max_delay,date_index", + [ + (0, 0, None), + (1, 0, None), + (0, 2, None), + (1, 1, None), + (1, 2, None), + (2, 2, None), + (7, 3, None), + (2, 4, None), + ], +) +@pytest.mark.parametrize( + "pipeline_class,estimator_name", + [ + (TimeSeriesRegressionPipeline, "Random Forest Regressor"), + (TimeSeriesBinaryClassificationPipeline, "Random Forest Classifier"), + (TimeSeriesMulticlassClassificationPipeline, "Random Forest Classifier"), + ], +) @patch("evalml.pipelines.components.RandomForestClassifier.fit") 
@patch("evalml.pipelines.components.RandomForestClassifier.predict") @patch("evalml.pipelines.components.RandomForestRegressor.fit") @patch("evalml.pipelines.components.RandomForestRegressor.predict") -@patch("evalml.pipelines.TimeSeriesClassificationPipeline._decode_targets", side_effect=lambda y: y) -def test_predict_pad_nans(mock_decode_targets, - mock_regressor_predict, mock_regressor_fit, mock_classifier_predict, mock_classifier_fit, - pipeline_class, - estimator_name, gap, max_delay, date_index, include_delayed_features, only_use_y, ts_data): +@patch( + "evalml.pipelines.TimeSeriesClassificationPipeline._decode_targets", + side_effect=lambda y: y, +) +def test_predict_pad_nans( + mock_decode_targets, + mock_regressor_predict, + mock_regressor_fit, + mock_classifier_predict, + mock_classifier_fit, + pipeline_class, + estimator_name, + gap, + max_delay, + date_index, + include_delayed_features, + only_use_y, + ts_data, +): if only_use_y and (not include_delayed_features or (max_delay == 0 and gap == 0)): pytest.skip("This would result in an empty feature dataframe.") X, y = ts_data - pl = pipeline_class(component_graph=["Delayed Feature Transformer", estimator_name], - parameters={"Delayed Feature Transformer": {"date_index": None, "gap": gap, "max_delay": max_delay, - "delay_features": include_delayed_features, - "delay_target": include_delayed_features}, - "pipeline": {"date_index": None, "gap": gap, "max_delay": max_delay}}) + pl = pipeline_class( + component_graph=["Delayed Feature Transformer", estimator_name], + parameters={ + "Delayed Feature Transformer": { + "date_index": None, + "gap": gap, + "max_delay": max_delay, + "delay_features": include_delayed_features, + "delay_target": include_delayed_features, + }, + "pipeline": {"date_index": None, "gap": gap, "max_delay": max_delay}, + }, + ) def mock_predict(df, y=None): return pd.Series(range(200, 200 + df.shape[0])) @@ -144,23 +249,54 @@ def mock_predict(df, y=None): @pytest.mark.parametrize("only_use_y", [True, False]) @pytest.mark.parametrize("include_delayed_features", [True, False]) -@pytest.mark.parametrize("gap,max_delay,date_index", [(0, 0, None), (1, 0, None), (0, 2, None), (1, 1, None), (1, 2, None), - (2, 2, None), (7, 3, None), (2, 4, None)]) -@pytest.mark.parametrize("pipeline_class,estimator_name", [(TimeSeriesRegressionPipeline, "Random Forest Regressor"), - (TimeSeriesBinaryClassificationPipeline, "Logistic Regression Classifier"), - (TimeSeriesMulticlassClassificationPipeline, "Logistic Regression Classifier")]) +@pytest.mark.parametrize( + "gap,max_delay,date_index", + [ + (0, 0, None), + (1, 0, None), + (0, 2, None), + (1, 1, None), + (1, 2, None), + (2, 2, None), + (7, 3, None), + (2, 4, None), + ], +) +@pytest.mark.parametrize( + "pipeline_class,estimator_name", + [ + (TimeSeriesRegressionPipeline, "Random Forest Regressor"), + (TimeSeriesBinaryClassificationPipeline, "Logistic Regression Classifier"), + (TimeSeriesMulticlassClassificationPipeline, "Logistic Regression Classifier"), + ], +) @patch("evalml.pipelines.components.RandomForestRegressor.fit") @patch("evalml.pipelines.components.RandomForestRegressor.predict") @patch("evalml.pipelines.components.LogisticRegressionClassifier.fit") @patch("evalml.pipelines.components.LogisticRegressionClassifier.predict") -@patch("evalml.pipelines.TimeSeriesClassificationPipeline._encode_targets", side_effect=lambda y: y) +@patch( + "evalml.pipelines.TimeSeriesClassificationPipeline._encode_targets", + side_effect=lambda y: y, +) 
@patch("evalml.pipelines.PipelineBase._score_all_objectives") @patch("evalml.pipelines.TimeSeriesBinaryClassificationPipeline._score_all_objectives") -def test_score_drops_nans(mock_binary_score, mock_score, mock_encode_targets, - mock_classifier_predict, mock_classifier_fit, - mock_regressor_predict, mock_regressor_fit, - pipeline_class, - estimator_name, gap, max_delay, date_index, include_delayed_features, only_use_y, ts_data): +def test_score_drops_nans( + mock_binary_score, + mock_score, + mock_encode_targets, + mock_classifier_predict, + mock_classifier_fit, + mock_regressor_predict, + mock_regressor_fit, + pipeline_class, + estimator_name, + gap, + max_delay, + date_index, + include_delayed_features, + only_use_y, + ts_data, +): if pipeline_class == TimeSeriesBinaryClassificationPipeline: mock_score = mock_binary_score if only_use_y and (not include_delayed_features or (max_delay == 0 and gap == 0)): @@ -175,11 +311,19 @@ def test_score_drops_nans(mock_binary_score, mock_score, mock_encode_targets, expected_target = np.arange(1 + gap, 32) target_index = pd.date_range(f"2020-10-01", f"2020-10-{31-gap}") - pl = pipeline_class(component_graph=["Delayed Feature Transformer", estimator_name], - parameters={"Delayed Feature Transformer": {"date_index": None, "gap": gap, "max_delay": max_delay, - "delay_features": include_delayed_features, - "delay_target": include_delayed_features}, - "pipeline": {"date_index": None, "gap": gap, "max_delay": max_delay}}) + pl = pipeline_class( + component_graph=["Delayed Feature Transformer", estimator_name], + parameters={ + "Delayed Feature Transformer": { + "date_index": None, + "gap": gap, + "max_delay": max_delay, + "delay_features": include_delayed_features, + "delay_target": include_delayed_features, + }, + "pipeline": {"date_index": None, "gap": gap, "max_delay": max_delay}, + }, + ) def mock_predict(X, y=None): return pd.Series(range(200, 200 + X.shape[0])) @@ -191,7 +335,7 @@ def mock_predict(X, y=None): if only_use_y: pl.fit(None, y) - pl.score(X=None, y=y, objectives=['MCC Binary']) + pl.score(X=None, y=y, objectives=["MCC Binary"]) else: pl.fit(X, y) pl.score(X, y, objectives=["MCC Binary"]) @@ -206,20 +350,32 @@ def mock_predict(X, y=None): np.testing.assert_equal(target.values, expected_target) -@pytest.mark.parametrize("pipeline_class", [TimeSeriesBinaryClassificationPipeline, TimeSeriesMulticlassClassificationPipeline]) +@pytest.mark.parametrize( + "pipeline_class", + [ + TimeSeriesBinaryClassificationPipeline, + TimeSeriesMulticlassClassificationPipeline, + ], +) @patch("evalml.pipelines.LogisticRegressionClassifier.fit") @patch("evalml.pipelines.LogisticRegressionClassifier.predict_proba") @patch("evalml.pipelines.LogisticRegressionClassifier.predict") @patch("evalml.pipelines.TimeSeriesClassificationPipeline._score_all_objectives") @patch("evalml.pipelines.ClassificationPipeline._decode_targets") @patch("evalml.pipelines.ClassificationPipeline._encode_targets") -def test_classification_pipeline_encodes_targets(mock_encode, mock_decode, - mock_score, mock_predict, mock_predict_proba, - mock_fit, pipeline_class, X_y_binary): +def test_classification_pipeline_encodes_targets( + mock_encode, + mock_decode, + mock_score, + mock_predict, + mock_predict_proba, + mock_fit, + pipeline_class, + X_y_binary, +): X, y = X_y_binary y_series = pd.Series(y) - df = pd.DataFrame({"negative": y_series, - "positive": y_series}) + df = pd.DataFrame({"negative": y_series, "positive": y_series}) df.ww.init() mock_predict.return_value = 
ww.init_series(y_series) mock_predict_proba.return_value = df @@ -230,9 +386,20 @@ def test_classification_pipeline_encodes_targets(mock_encode, mock_decode, mock_encode.return_value = y_series mock_decode.return_value = y_encoded - pl = pipeline_class(component_graph=['Delayed Feature Transformer', 'Logistic Regression Classifier'], - parameters={"Delayed Feature Transformer": {"date_index": None, "gap": 0, "max_delay": 1}, - "pipeline": {"date_index": None, "gap": 0, "max_delay": 1}}) + pl = pipeline_class( + component_graph=[ + "Delayed Feature Transformer", + "Logistic Regression Classifier", + ], + parameters={ + "Delayed Feature Transformer": { + "date_index": None, + "gap": 0, + "max_delay": 1, + }, + "pipeline": {"date_index": None, "gap": 0, "max_delay": 1}, + }, + ) # Check fit encodes target pl.fit(X, y_encoded) @@ -253,30 +420,55 @@ def test_classification_pipeline_encodes_targets(mock_encode, mock_decode, # Check score encodes target mock_encode.reset_mock() - pl.score(X, y_encoded, objectives=['MCC Binary']) + pl.score(X, y_encoded, objectives=["MCC Binary"]) mock_encode.assert_called_once() -@pytest.mark.parametrize("pipeline_class,objectives", [(TimeSeriesBinaryClassificationPipeline, ["MCC Binary"]), - (TimeSeriesBinaryClassificationPipeline, ["Log Loss Binary"]), - (TimeSeriesBinaryClassificationPipeline, ["MCC Binary", "Log Loss Binary"]), - (TimeSeriesMulticlassClassificationPipeline, ["MCC Multiclass"]), - (TimeSeriesMulticlassClassificationPipeline, ["Log Loss Multiclass"]), - (TimeSeriesMulticlassClassificationPipeline, ["MCC Multiclass", "Log Loss Multiclass"]), - (TimeSeriesRegressionPipeline, ['R2']), - (TimeSeriesRegressionPipeline, ['R2', "Mean Absolute Percentage Error"])]) +@pytest.mark.parametrize( + "pipeline_class,objectives", + [ + (TimeSeriesBinaryClassificationPipeline, ["MCC Binary"]), + (TimeSeriesBinaryClassificationPipeline, ["Log Loss Binary"]), + (TimeSeriesBinaryClassificationPipeline, ["MCC Binary", "Log Loss Binary"]), + (TimeSeriesMulticlassClassificationPipeline, ["MCC Multiclass"]), + (TimeSeriesMulticlassClassificationPipeline, ["Log Loss Multiclass"]), + ( + TimeSeriesMulticlassClassificationPipeline, + ["MCC Multiclass", "Log Loss Multiclass"], + ), + (TimeSeriesRegressionPipeline, ["R2"]), + (TimeSeriesRegressionPipeline, ["R2", "Mean Absolute Percentage Error"]), + ], +) @pytest.mark.parametrize("data_type", ["pd", "ww"]) -def test_score_works(pipeline_class, objectives, data_type, X_y_binary, X_y_multi, X_y_regression, make_data_type): - - preprocessing = ['Delayed Feature Transformer'] +def test_score_works( + pipeline_class, + objectives, + data_type, + X_y_binary, + X_y_multi, + X_y_regression, + make_data_type, +): + + preprocessing = ["Delayed Feature Transformer"] if pipeline_class == TimeSeriesRegressionPipeline: - components = preprocessing + ['Random Forest Regressor'] + components = preprocessing + ["Random Forest Regressor"] else: components = preprocessing + ["Logistic Regression Classifier"] - pl = pipeline_class(component_graph=components, - parameters={"pipeline": {"date_index": None, "gap": 1, "max_delay": 2, "delay_features": False}, - components[-1]: {'n_jobs': 1}}) + pl = pipeline_class( + component_graph=components, + parameters={ + "pipeline": { + "date_index": None, + "gap": 1, + "max_delay": 2, + "delay_features": False, + }, + components[-1]: {"n_jobs": 1}, + }, + ) if pl.problem_type == ProblemTypes.TIME_SERIES_BINARY: X, y = X_y_binary y = pd.Series(y).map(lambda label: "good" if label == 1 else "bad") @@ 
-301,18 +493,30 @@ def test_score_works(pipeline_class, objectives, data_type, X_y_binary, X_y_mult pl.score(X, y, objectives) -@patch('evalml.pipelines.TimeSeriesClassificationPipeline._decode_targets') -@patch('evalml.objectives.BinaryClassificationObjective.decision_function') -@patch('evalml.pipelines.components.Estimator.predict_proba', return_value=pd.DataFrame({0: [1.]})) -@patch('evalml.pipelines.components.Estimator.predict', return_value=pd.Series([1.])) -def test_binary_classification_predictions_thresholded_properly(mock_predict, mock_predict_proba, - mock_obj_decision, mock_decode, - X_y_binary, dummy_ts_binary_pipeline_class): +@patch("evalml.pipelines.TimeSeriesClassificationPipeline._decode_targets") +@patch("evalml.objectives.BinaryClassificationObjective.decision_function") +@patch( + "evalml.pipelines.components.Estimator.predict_proba", + return_value=pd.DataFrame({0: [1.0]}), +) +@patch("evalml.pipelines.components.Estimator.predict", return_value=pd.Series([1.0])) +def test_binary_classification_predictions_thresholded_properly( + mock_predict, + mock_predict_proba, + mock_obj_decision, + mock_decode, + X_y_binary, + dummy_ts_binary_pipeline_class, +): mock_objs = [mock_decode, mock_predict] mock_decode.return_value = pd.Series([0, 1]) X, y = X_y_binary - binary_pipeline = dummy_ts_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}, - "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}}) + binary_pipeline = dummy_ts_binary_pipeline_class( + parameters={ + "Logistic Regression Classifier": {"n_jobs": 1}, + "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}, + } + ) # test no objective passed and no custom threshold uses underlying estimator's predict method binary_pipeline.fit(X, y) binary_pipeline.predict(X, y) @@ -321,7 +525,7 @@ def test_binary_classification_predictions_thresholded_properly(mock_predict, mo mock_obj.reset_mock() # test objective passed but no custom threshold uses underlying estimator's predict method - binary_pipeline.predict(X, y, 'precision') + binary_pipeline.predict(X, y, "precision") for mock_obj in mock_objs: mock_obj.assert_called() mock_obj.reset_mock() @@ -349,8 +553,8 @@ def test_binary_classification_predictions_thresholded_properly(mock_predict, mo # test custom threshold set and objective passed binary_pipeline.threshold = 0.6 - mock_obj_decision.return_value = pd.Series([1.]) - binary_pipeline.predict(X, y, 'precision') + mock_obj_decision.return_value = pd.Series([1.0]) + binary_pipeline.predict(X, y, "precision") for mock_obj in mock_objs: mock_obj.assert_called() mock_obj.reset_mock() @@ -358,35 +562,68 @@ def test_binary_classification_predictions_thresholded_properly(mock_predict, mo mock_obj_decision.assert_called() -@patch('evalml.pipelines.PipelineBase.compute_estimator_features') -def test_binary_predict_pipeline_objective_mismatch(mock_transform, X_y_binary, dummy_ts_binary_pipeline_class): +@patch("evalml.pipelines.PipelineBase.compute_estimator_features") +def test_binary_predict_pipeline_objective_mismatch( + mock_transform, X_y_binary, dummy_ts_binary_pipeline_class +): X, y = X_y_binary - binary_pipeline = dummy_ts_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}, - "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}}) + binary_pipeline = dummy_ts_binary_pipeline_class( + parameters={ + "Logistic Regression Classifier": {"n_jobs": 1}, + "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}, + } + ) binary_pipeline.fit(X, y) - 
with pytest.raises(ValueError, match="Objective Precision Micro is not defined for time series binary classification."): + with pytest.raises( + ValueError, + match="Objective Precision Micro is not defined for time series binary classification.", + ): binary_pipeline.predict(X, y, "precision micro") mock_transform.assert_called() -@pytest.mark.parametrize("problem_type", [ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS, ProblemTypes.TIME_SERIES_REGRESSION]) -def test_time_series_pipeline_not_fitted_error(problem_type, X_y_binary, X_y_multi, X_y_regression, - time_series_binary_classification_pipeline_class, - time_series_multiclass_classification_pipeline_class, - time_series_regression_pipeline_class): +@pytest.mark.parametrize( + "problem_type", + [ + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ProblemTypes.TIME_SERIES_REGRESSION, + ], +) +def test_time_series_pipeline_not_fitted_error( + problem_type, + X_y_binary, + X_y_multi, + X_y_regression, + time_series_binary_classification_pipeline_class, + time_series_multiclass_classification_pipeline_class, + time_series_regression_pipeline_class, +): if problem_type == ProblemTypes.TIME_SERIES_BINARY: X, y = X_y_binary - clf = time_series_binary_classification_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}, - "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}}) + clf = time_series_binary_classification_pipeline_class( + parameters={ + "Logistic Regression Classifier": {"n_jobs": 1}, + "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}, + } + ) elif problem_type == ProblemTypes.TIME_SERIES_MULTICLASS: X, y = X_y_multi - clf = time_series_multiclass_classification_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}, - "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}}) + clf = time_series_multiclass_classification_pipeline_class( + parameters={ + "Logistic Regression Classifier": {"n_jobs": 1}, + "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}, + } + ) elif problem_type == ProblemTypes.TIME_SERIES_REGRESSION: X, y = X_y_regression - clf = time_series_regression_pipeline_class(parameters={"Linear Regressor": {"n_jobs": 1}, - "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}}) + clf = time_series_regression_pipeline_class( + parameters={ + "Linear Regressor": {"n_jobs": 1}, + "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}, + } + ) with pytest.raises(PipelineNotYetFittedError): clf.predict(X) @@ -400,26 +637,28 @@ def test_time_series_pipeline_not_fitted_error(problem_type, X_y_binary, X_y_mul clf.fit(X, y) if is_classification(problem_type): - to_patch = 'evalml.pipelines.TimeSeriesClassificationPipeline._predict' + to_patch = "evalml.pipelines.TimeSeriesClassificationPipeline._predict" if problem_type == ProblemTypes.TIME_SERIES_BINARY: - to_patch = 'evalml.pipelines.TimeSeriesBinaryClassificationPipeline._predict' + to_patch = ( + "evalml.pipelines.TimeSeriesBinaryClassificationPipeline._predict" + ) with patch(to_patch) as mock_predict: clf.predict(X, y) mock_predict.assert_called() _, kwargs = mock_predict.call_args - assert kwargs['objective'] is None + assert kwargs["objective"] is None mock_predict.reset_mock() - clf.predict(X, y, 'Log Loss Binary') + clf.predict(X, y, "Log Loss Binary") mock_predict.assert_called() _, kwargs = mock_predict.call_args - assert kwargs['objective'] is not None + assert kwargs["objective"] is not None mock_predict.reset_mock() - clf.predict(X, y, 
objective='Log Loss Binary') + clf.predict(X, y, objective="Log Loss Binary") mock_predict.assert_called() _, kwargs = mock_predict.call_args - assert kwargs['objective'] is not None + assert kwargs["objective"] is not None clf.predict_proba(X, y) else: @@ -427,14 +666,20 @@ def test_time_series_pipeline_not_fitted_error(problem_type, X_y_binary, X_y_mul clf.feature_importance -def test_ts_binary_pipeline_target_thresholding(make_data_type, time_series_binary_classification_pipeline_class, X_y_binary): +def test_ts_binary_pipeline_target_thresholding( + make_data_type, time_series_binary_classification_pipeline_class, X_y_binary +): X, y = X_y_binary - X = make_data_type('ww', X) - y = make_data_type('ww', y) + X = make_data_type("ww", X) + y = make_data_type("ww", y) objective = get_objective("F1", return_instance=True) - binary_pipeline = time_series_binary_classification_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}, - "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}}) + binary_pipeline = time_series_binary_classification_pipeline_class( + parameters={ + "Logistic Regression Classifier": {"n_jobs": 1}, + "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}, + } + ) binary_pipeline.fit(X, y) assert binary_pipeline.threshold is None pred_proba = binary_pipeline.predict_proba(X, y).iloc[:, 1] @@ -442,14 +687,20 @@ def test_ts_binary_pipeline_target_thresholding(make_data_type, time_series_bina assert binary_pipeline.threshold is not None -@patch('evalml.objectives.FraudCost.decision_function') -def test_binary_predict_pipeline_use_objective(mock_decision_function, X_y_binary, time_series_binary_classification_pipeline_class): +@patch("evalml.objectives.FraudCost.decision_function") +def test_binary_predict_pipeline_use_objective( + mock_decision_function, X_y_binary, time_series_binary_classification_pipeline_class +): X, y = X_y_binary - binary_pipeline = time_series_binary_classification_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}, - "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}}) + binary_pipeline = time_series_binary_classification_pipeline_class( + parameters={ + "Logistic Regression Classifier": {"n_jobs": 1}, + "pipeline": {"gap": 0, "max_delay": 0, "date_index": None}, + } + ) mock_decision_function.return_value = pd.Series([0] * 98) binary_pipeline.threshold = 0.7 binary_pipeline.fit(X, y) fraud_cost = FraudCost(amount_col=0) - binary_pipeline.score(X, y, ['precision', 'auc', fraud_cost]) + binary_pipeline.score(X, y, ["precision", "auc", fraud_cost]) mock_decision_function.assert_called() diff --git a/evalml/tests/preprocessing_tests/test_balanced_classification_sampler.py b/evalml/tests/preprocessing_tests/test_balanced_classification_sampler.py index 600a69ece0..32ca40ec55 100644 --- a/evalml/tests/preprocessing_tests/test_balanced_classification_sampler.py +++ b/evalml/tests/preprocessing_tests/test_balanced_classification_sampler.py @@ -6,11 +6,16 @@ from evalml.preprocessing.data_splitters import BalancedClassificationSampler -@pytest.mark.parametrize("ratio,samples,percentage,seed", - [(1, 1, 0.2, 1), - (0.3, 101, 0.5, 100)]) +@pytest.mark.parametrize( + "ratio,samples,percentage,seed", [(1, 1, 0.2, 1), (0.3, 101, 0.5, 100)] +) def test_balanced_classification_init(ratio, samples, percentage, seed): - bcs = BalancedClassificationSampler(sampling_ratio=ratio, min_samples=samples, min_percentage=percentage, random_seed=seed) + bcs = BalancedClassificationSampler( + sampling_ratio=ratio, + 
min_samples=samples, + min_percentage=percentage, + random_seed=seed, + ) assert bcs.sampling_ratio == ratio assert bcs.min_samples == samples assert bcs.min_percentage == percentage @@ -91,9 +96,15 @@ def test_classification_imbalanced_sampling_ratio(num_classes, sampling_ratio): pd.testing.assert_series_equal(y, y2) else: # remove some samples - assert len(X2) == {2: (250 + int(250 / sampling_ratio)), 3: (400 + int(200 / sampling_ratio))}[num_classes] + assert len(X2) == { + 2: (250 + int(250 / sampling_ratio)), + 3: (400 + int(200 / sampling_ratio)), + }[num_classes] assert len(y2) == len(X2) - assert y2.value_counts().values[0] == int(1 / sampling_ratio) * {2: 250, 3: 200}[num_classes] + assert ( + y2.value_counts().values[0] + == int(1 / sampling_ratio) * {2: 250, 3: 200}[num_classes] + ) @pytest.mark.parametrize("min_samples", [10, 50, 100, 200, 500]) @@ -148,7 +159,9 @@ def test_classification_imbalanced_min_percentage(num_classes, min_percentage): def test_classification_imbalanced_severe_imbalance_binary(min_samples, min_percentage): X = pd.DataFrame({"a": [i for i in range(1000)]}) y = pd.Series([0] * 850 + [1] * 150) # minority class is 15% of total distribution - bcs = BalancedClassificationSampler(sampling_ratio=0.5, min_samples=min_samples, min_percentage=min_percentage) + bcs = BalancedClassificationSampler( + sampling_ratio=0.5, min_samples=min_samples, min_percentage=min_percentage + ) indices = bcs.fit_resample(X, y) X2 = X.loc[indices] y2 = y.loc[indices] @@ -165,8 +178,12 @@ def test_classification_imbalanced_severe_imbalance_binary(min_samples, min_perc @pytest.mark.parametrize("min_samples", [10, 50, 100, 200, 500]) def test_classification_imbalanced_normal_imbalance_binary(min_samples, sampling_ratio): X = pd.DataFrame({"a": [i for i in range(1000)]}) - y = pd.Series([0] * 850 + [1] * 150) # minority class is 15% of total distribution, never counts as severe imbalance - bcs = BalancedClassificationSampler(sampling_ratio=sampling_ratio, min_samples=min_samples) + y = pd.Series( + [0] * 850 + [1] * 150 + ) # minority class is 15% of total distribution, never counts as severe imbalance + bcs = BalancedClassificationSampler( + sampling_ratio=sampling_ratio, min_samples=min_samples + ) indices = bcs.fit_resample(X, y) X2 = X.loc[indices] y2 = y.loc[indices] @@ -176,19 +193,27 @@ def test_classification_imbalanced_normal_imbalance_binary(min_samples, sampling else: # rebalance according to the ratio and min_samples assert len(X2) == 150 + max(min_samples, int(150 / sampling_ratio)) - assert y2.value_counts().values[0] == max(min_samples, int(150 / sampling_ratio)) + assert y2.value_counts().values[0] == max( + min_samples, int(150 / sampling_ratio) + ) -@pytest.mark.parametrize("data_type", ['n', 's']) +@pytest.mark.parametrize("data_type", ["n", "s"]) @pytest.mark.parametrize("min_percentage", [0.01, 0.05, 0.2, 0.3]) @pytest.mark.parametrize("min_samples", [10, 50, 100, 200, 500]) -def test_classification_imbalanced_severe_imbalance_multiclass(data_type, min_samples, min_percentage): +def test_classification_imbalanced_severe_imbalance_multiclass( + data_type, min_samples, min_percentage +): X = pd.DataFrame({"a": [i for i in range(1000)]}) - if data_type == 'n': - y = pd.Series([0] * 800 + [1] * 100 + [2] * 100) # minority class is 10% of total distribution + if data_type == "n": + y = pd.Series( + [0] * 800 + [1] * 100 + [2] * 100 + ) # minority class is 10% of total distribution else: y = pd.Series(["class_1"] * 800 + ["class_2"] * 100 + ["class_3"] * 100) - bcs = 
BalancedClassificationSampler(sampling_ratio=0.5, min_samples=min_samples, min_percentage=min_percentage) + bcs = BalancedClassificationSampler( + sampling_ratio=0.5, min_samples=min_samples, min_percentage=min_percentage + ) indices = bcs.fit_resample(X, y) X2 = X.loc[indices] y2 = y.loc[indices] @@ -201,16 +226,22 @@ def test_classification_imbalanced_severe_imbalance_multiclass(data_type, min_sa assert y2.value_counts().values[0] == max(min_samples, 2 * 100) -@pytest.mark.parametrize("data_type", ['n', 's']) +@pytest.mark.parametrize("data_type", ["n", "s"]) @pytest.mark.parametrize("sampling_ratio", [1, 0.5, 0.33, 0.25, 0.2, 0.1]) @pytest.mark.parametrize("min_samples", [10, 50, 100, 200, 500]) -def test_classification_imbalanced_normal_imbalance_multiclass(data_type, min_samples, sampling_ratio): +def test_classification_imbalanced_normal_imbalance_multiclass( + data_type, min_samples, sampling_ratio +): X = pd.DataFrame({"a": [i for i in range(1000)]}) - if data_type == 'n': - y = pd.Series([0] * 800 + [1] * 100 + [2] * 100) # minority class is 10% of total distribution + if data_type == "n": + y = pd.Series( + [0] * 800 + [1] * 100 + [2] * 100 + ) # minority class is 10% of total distribution else: y = pd.Series(["class_1"] * 800 + ["class_2"] * 100 + ["class_3"] * 100) - bcs = BalancedClassificationSampler(sampling_ratio=sampling_ratio, min_samples=min_samples) + bcs = BalancedClassificationSampler( + sampling_ratio=sampling_ratio, min_samples=min_samples + ) indices = bcs.fit_resample(X, y) X2 = X.loc[indices] y2 = y.loc[indices] @@ -220,7 +251,9 @@ def test_classification_imbalanced_normal_imbalance_multiclass(data_type, min_sa else: # rebalance according to the ratio and min_samples assert len(X2) == 200 + max(min_samples, int(100 / sampling_ratio)) - assert y2.value_counts().values[0] == max(min_samples, int(100 / sampling_ratio)) + assert y2.value_counts().values[0] == max( + min_samples, int(100 / sampling_ratio) + ) @pytest.mark.parametrize("sampling_ratio", [1, 0.5, 0.33, 0.25, 0.2, 0.1]) @@ -228,8 +261,12 @@ def test_classification_imbalanced_normal_imbalance_multiclass(data_type, min_sa def test_classification_imbalanced_random_seed(random_seed, sampling_ratio): X = pd.DataFrame({"a": [i for i in range(1000)]}) y = pd.Series([0] * 800 + [1] * 200) - bcs1 = BalancedClassificationSampler(sampling_ratio=sampling_ratio, random_seed=random_seed) - bcs2 = BalancedClassificationSampler(sampling_ratio=sampling_ratio, random_seed=random_seed) + bcs1 = BalancedClassificationSampler( + sampling_ratio=sampling_ratio, random_seed=random_seed + ) + bcs2 = BalancedClassificationSampler( + sampling_ratio=sampling_ratio, random_seed=random_seed + ) indices1 = bcs1.fit_resample(X, y) X1 = X.loc[indices1] y1 = y.loc[indices1] @@ -247,12 +284,16 @@ def test_classification_imbalanced_random_seed(random_seed, sampling_ratio): pd.testing.assert_series_equal(y1, y2) -@pytest.mark.parametrize("index", [[f'hello_{i}' for i in range(1000)], - random.shuffle([i + 0.5 for i in range(1000)]), - pd.MultiIndex.from_arrays([ - [f"index_{i}" for i in range(1000)], - [i for i in range(1000)] - ])]) +@pytest.mark.parametrize( + "index", + [ + [f"hello_{i}" for i in range(1000)], + random.shuffle([i + 0.5 for i in range(1000)]), + pd.MultiIndex.from_arrays( + [[f"index_{i}" for i in range(1000)], [i for i in range(1000)]] + ), + ], +) def test_classification_imbalanced_custom_indices(index): X = pd.DataFrame({"a": [i for i in range(1000)]}, index=index) y = pd.Series([0] * 900 + [1] * 100, index=index) @@ 
-311,7 +352,7 @@ def test_classification_imbalanced_multiple_multiclass():
     assert all(y2.value_counts().values == [800, 800, 200])
     assert y2.value_counts()[2] == 200

-    bcs = BalancedClassificationSampler(sampling_ratio=.3333)
+    bcs = BalancedClassificationSampler(sampling_ratio=0.3333)
     indices = bcs.fit_resample(X, y)
     X2 = X.loc[indices]
     y2 = y.loc[indices]
@@ -321,7 +362,7 @@ def test_classification_imbalanced_multiple_multiclass():
     assert y2.value_counts()[2] == 200


-@pytest.mark.parametrize("data_type", ['li', 'np', 'pd', 'ww'])
+@pytest.mark.parametrize("data_type", ["li", "np", "pd", "ww"])
 def test_classification_imbalanced_data_type(data_type, make_data_type):
     X = pd.DataFrame({"a": [i for i in range(1000)]})
     y = pd.Series([0] * 900 + [1] * 100)
@@ -331,21 +372,29 @@ def test_classification_imbalanced_data_type(data_type, make_data_type):
     bcs = BalancedClassificationSampler()
     indices = bcs.fit_resample(X, y)
     assert len(indices) == 500
-    if data_type in ['pd', 'np']:
+    if data_type in ["pd", "np"]:
         y2 = y.loc[indices]
         assert all(y2.value_counts().values == [400, 100])
         assert y2.value_counts()[1] == 100


 def test_classification_data_frame_dtypes():
-    X = pd.DataFrame({
-        "integers": [i for i in range(1000)],
-        "strings": [f"string_{i % 3}" for i in range(1000)],
-        "text": [f"this should be text data because {i} think it's a long string. Let's hope it behaves in that way" for i in range(1000)],
-        "float": [i / 10000 for i in range(1000)],
-        "bool": [bool(i % 2) for i in range(1000)],
-        "datetime": [random.choice([2012 / 1 / 2, 2012 / 2 / 1, 2012 / 4 / 2]) for i in range(1000)]
-    })
+    X = pd.DataFrame(
+        {
+            "integers": [i for i in range(1000)],
+            "strings": [f"string_{i % 3}" for i in range(1000)],
+            "text": [
+                f"this should be text data because {i} think it's a long string. Let's hope it behaves in that way"
+                for i in range(1000)
+            ],
+            "float": [i / 10000 for i in range(1000)],
+            "bool": [bool(i % 2) for i in range(1000)],
+            "datetime": [
+                random.choice([2012 / 1 / 2, 2012 / 2 / 1, 2012 / 4 / 2])
+                for i in range(1000)
+            ],
+        }
+    )
     y = pd.Series([0] * 900 + [1] * 100)
     bcs = BalancedClassificationSampler()
     indices = bcs.fit_resample(X, y)
@@ -355,7 +404,7 @@ def test_classification_data_frame_dtypes():
     assert all(y2.value_counts().values == [400, 100])
     assert y2.value_counts()[1] == 100

-    X['integers'][0] = None
+    X["integers"][0] = None
     indices = bcs.fit_resample(X, y)
     X2 = X.loc[indices]
     y2 = y.loc[indices]
@@ -391,18 +440,25 @@ def test_dict_overrides_ratio():
     X = pd.DataFrame({"a": [i for i in range(1000)]})
     y = pd.Series([0] * 200 + [1] * 800)
     sampling_ratio_dict = {0: 200, 1: 800}
-    bcs = BalancedClassificationSampler(sampling_ratio=0.1, sampling_ratio_dict=sampling_ratio_dict)
+    bcs = BalancedClassificationSampler(
+        sampling_ratio=0.1, sampling_ratio_dict=sampling_ratio_dict
+    )
     indices = bcs.fit_resample(X, y)
     y_new = y.iloc[indices]
     y_sampled_count = y_new.value_counts().to_dict()
     assert y_sampled_count == sampling_ratio_dict


-@pytest.mark.parametrize("sampling_ratio_dict,expected", [({0: 200, 1: 700}, {0: 200, 1: 700}),
-                                                          ({0: 100, 1: 100}, {0: 100, 1: 100}),
-                                                          ({0: 200, 1: 800}, {0: 200, 1: 800}),
-                                                          ({0: 100, 1: 805}, {0: 100, 1: 800}),
-                                                          ({0: 200, 1: 805}, {0: 200, 1: 800})])
+@pytest.mark.parametrize(
+    "sampling_ratio_dict,expected",
+    [
+        ({0: 200, 1: 700}, {0: 200, 1: 700}),
+        ({0: 100, 1: 100}, {0: 100, 1: 100}),
+        ({0: 200, 1: 800}, {0: 200, 1: 800}),
+        ({0: 100, 1: 805}, {0: 100, 1: 800}),
+        ({0: 200, 1: 805}, {0: 200, 1: 800}),
+    ],
+)
 def test_sampler_ratio_dictionary_binary(sampling_ratio_dict, expected):
     X = pd.DataFrame({"a": [i for i in range(1000)]})
     y = pd.Series([0] * 200 + [1] * 800)
@@ -413,11 +469,16 @@ def test_sampler_ratio_dictionary_binary(sampling_ratio_dict, expected):
     assert y_sampled_count == expected


-@pytest.mark.parametrize("sampling_ratio_dict,expected", [({0: 200, 1: 700, 2: 150}, {0: 200, 1: 700, 2: 150}),
-                                                          ({0: 100, 1: 100, 2: 150}, {0: 100, 1: 100, 2: 150}),
-                                                          ({0: 200, 1: 800, 2: 200}, {0: 200, 1: 800, 2: 200}),
-                                                          ({0: 100, 1: 805, 2: 400}, {0: 100, 1: 800, 2: 200}),
-                                                          ({0: 200, 1: 805, 2: 400}, {0: 200, 1: 800, 2: 200})])
+@pytest.mark.parametrize(
+    "sampling_ratio_dict,expected",
+    [
+        ({0: 200, 1: 700, 2: 150}, {0: 200, 1: 700, 2: 150}),
+        ({0: 100, 1: 100, 2: 150}, {0: 100, 1: 100, 2: 150}),
+        ({0: 200, 1: 800, 2: 200}, {0: 200, 1: 800, 2: 200}),
+        ({0: 100, 1: 805, 2: 400}, {0: 100, 1: 800, 2: 200}),
+        ({0: 200, 1: 805, 2: 400}, {0: 200, 1: 800, 2: 200}),
+    ],
+)
 def test_sampler_ratio_dictionary_multiclass(sampling_ratio_dict, expected):
     X = pd.DataFrame({"a": [i for i in range(1200)]})
     y = pd.Series([0] * 200 + [1] * 800 + [2] * 200)
diff --git a/evalml/tests/preprocessing_tests/test_drop_na_rows.py b/evalml/tests/preprocessing_tests/test_drop_na_rows.py
index 4862bf3a02..2d76388475 100644
--- a/evalml/tests/preprocessing_tests/test_drop_na_rows.py
+++ b/evalml/tests/preprocessing_tests/test_drop_na_rows.py
@@ -10,7 +10,7 @@ def X_y_na():
     y = pd.Series([1, 0, 1, np.nan])
     X = pd.DataFrame()
-    X["a"] = ['a', 'b', 'c', 'd']
+    X["a"] = ["a", "b", "c", "d"]
     X["b"] = [1, 2, 3, 0]
     X["c"] = [np.nan, 0, 0, np.nan]
     X["d"] = [0, 0, 0, 0]
@@ -28,10 +28,9 @@ def test_drop_nan_target_rows(X_y_na):


 def test_with_numpy_input(X_y_na):
     _, y = X_y_na
-    X_arr = np.array([[1, 2, 3, 0],
-                      [np.nan, 0, 0, 1],
-                      [np.nan, 0,
np.nan, 0], - [np.nan, 0, 0, 0]]) + X_arr = np.array( + [[1, 2, 3, 0], [np.nan, 0, 0, 1], [np.nan, 0, np.nan, 0], [np.nan, 0, 0, 0]] + ) y_arr = y.values X_t, y_t = drop_nan_target_rows(X_arr, y_arr) y_expected = pd.Series([1, 0, 1]) diff --git a/evalml/tests/preprocessing_tests/test_split_data.py b/evalml/tests/preprocessing_tests/test_split_data.py index c589122b96..c7fe8a9ff0 100644 --- a/evalml/tests/preprocessing_tests/test_split_data.py +++ b/evalml/tests/preprocessing_tests/test_split_data.py @@ -7,13 +7,15 @@ is_binary, is_multiclass, is_regression, - is_time_series + is_time_series, ) @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) -@pytest.mark.parametrize("data_type", ['np', 'pd', 'ww']) -def test_split_data(problem_type, data_type, X_y_binary, X_y_multi, X_y_regression, make_data_type): +@pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) +def test_split_data( + problem_type, data_type, X_y_binary, X_y_multi, X_y_regression, make_data_type +): if is_binary(problem_type): X, y = X_y_binary if is_multiclass(problem_type): @@ -22,14 +24,19 @@ def test_split_data(problem_type, data_type, X_y_binary, X_y_multi, X_y_regressi X, y = X_y_regression problem_configuration = None if is_time_series(problem_type): - problem_configuration = {'gap': 1, 'max_delay': 7, "date_index": None} + problem_configuration = {"gap": 1, "max_delay": 7, "date_index": None} X = make_data_type(data_type, X) y = make_data_type(data_type, y) test_pct = 0.25 - X_train, X_test, y_train, y_test = split_data(X, y, test_size=test_pct, problem_type=problem_type, - problem_configuration=problem_configuration) + X_train, X_test, y_train, y_test = split_data( + X, + y, + test_size=test_pct, + problem_type=problem_type, + problem_configuration=problem_configuration, + ) test_size = len(X) * test_pct train_size = len(X) - test_size assert len(X_train) == train_size diff --git a/evalml/tests/preprocessing_tests/test_training_validation_split.py b/evalml/tests/preprocessing_tests/test_training_validation_split.py index 0bebef8489..f2905d109e 100644 --- a/evalml/tests/preprocessing_tests/test_training_validation_split.py +++ b/evalml/tests/preprocessing_tests/test_training_validation_split.py @@ -10,8 +10,8 @@ def test_tvsplit_nsplits(): def test_tvsplit_default(): - X = pd.DataFrame({'col1': np.arange(0, 10)}) - y = pd.Series(np.arange(100, 110), name='target') + X = pd.DataFrame({"col1": np.arange(0, 10)}) + y = pd.Series(np.arange(100, 110), name="target") splitter = TrainingValidationSplit() splits = list(splitter.split(X, y=y)) assert len(splits) == 1 and len(splits[0]) == 2 @@ -21,8 +21,8 @@ def test_tvsplit_default(): def test_tvsplit_size(): - X = pd.DataFrame({'col1': np.arange(0, 10)}) - y = pd.Series(np.arange(100, 110), name='target') + X = pd.DataFrame({"col1": np.arange(0, 10)}) + y = pd.Series(np.arange(100, 110), name="target") splitter = TrainingValidationSplit(test_size=0.2, train_size=0.3) splits = list(splitter.split(X, y=y)) assert len(splits) == 1 and len(splits[0]) == 2 @@ -37,8 +37,8 @@ def test_tvsplit_size(): def test_tvsplit_shuffle(): - X = pd.DataFrame({'col1': np.arange(0, 10)}) - y = pd.Series(np.arange(100, 110), name='target') + X = pd.DataFrame({"col1": np.arange(0, 10)}) + y = pd.Series(np.arange(100, 110), name="target") splitter = TrainingValidationSplit(shuffle=True, random_seed=0) splits = list(splitter.split(X, y=y)) assert len(splits) == 1 and len(splits[0]) == 2 @@ -47,9 +47,11 @@ def test_tvsplit_shuffle(): def test_tvsplit_stratify(): - X = 
pd.DataFrame({'col1': np.arange(0, 10)}) - y = pd.Series(np.arange(5).repeat(2), name='target') - splitter = TrainingValidationSplit(train_size=5, test_size=5, shuffle=True, stratify=y, random_seed=0) + X = pd.DataFrame({"col1": np.arange(0, 10)}) + y = pd.Series(np.arange(5).repeat(2), name="target") + splitter = TrainingValidationSplit( + train_size=5, test_size=5, shuffle=True, stratify=y, random_seed=0 + ) splits = list(splitter.split(X, y=y)) assert len(splits) == 1 and len(splits[0]) == 2 np.testing.assert_equal(splits[0][0], [1, 4, 2, 8, 7]) @@ -59,8 +61,10 @@ def test_tvsplit_stratify(): @pytest.mark.parametrize("random_seed", [0, 11, 57, 99, 1000]) def test_tvsplit_always_within_bounds_with_custom_index(random_seed): N = 11000 - X = pd.DataFrame({'col1': np.arange(0, N)}, index=np.arange(20000, 20000 + N)) - splitter = TrainingValidationSplit(train_size=0.75, shuffle=True, random_seed=random_seed) + X = pd.DataFrame({"col1": np.arange(0, N)}, index=np.arange(20000, 20000 + N)) + splitter = TrainingValidationSplit( + train_size=0.75, shuffle=True, random_seed=random_seed + ) splits = list(splitter.split(X, y=None)) assert np.all(np.logical_and(splits[0][0] < N, splits[0][0] >= 0)) assert np.all(np.logical_and(splits[0][1] < N, splits[0][1] >= 0)) diff --git a/evalml/tests/problem_type_tests/test_problem_types.py b/evalml/tests/problem_type_tests/test_problem_types.py index 927e584ea3..080868a90d 100644 --- a/evalml/tests/problem_type_tests/test_problem_types.py +++ b/evalml/tests/problem_type_tests/test_problem_types.py @@ -10,27 +10,38 @@ is_classification, is_multiclass, is_regression, - is_time_series + is_time_series, ) @pytest.fixture def correct_problem_types(): # Unit tests expect this order - correct_problem_types = [ProblemTypes.REGRESSION, ProblemTypes.MULTICLASS, - ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_REGRESSION, - ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS] + correct_problem_types = [ + ProblemTypes.REGRESSION, + ProblemTypes.MULTICLASS, + ProblemTypes.BINARY, + ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ] yield correct_problem_types def test_handle_string(correct_problem_types): - problem_types = ['regression', ProblemTypes.MULTICLASS, 'binary', ProblemTypes.TIME_SERIES_REGRESSION, - 'time series binary', 'time series multiclass'] + problem_types = [ + "regression", + ProblemTypes.MULTICLASS, + "binary", + ProblemTypes.TIME_SERIES_REGRESSION, + "time series binary", + "time series multiclass", + ] for problem_type in zip(problem_types, correct_problem_types): assert handle_problem_types(problem_type[0]) == problem_type[1] - problem_type = 'fake' - error_msg = 'Problem type \'fake\' does not exist' + problem_type = "fake" + error_msg = "Problem type 'fake' does not exist" with pytest.raises(KeyError, match=error_msg): handle_problem_types(problem_type) == ProblemTypes.REGRESSION @@ -41,7 +52,7 @@ def test_handle_problem_types(correct_problem_types): def test_handle_incorrect_type(): - error_msg = '`handle_problem_types` was not passed a str or ProblemTypes object' + error_msg = "`handle_problem_types` was not passed a str or ProblemTypes object" with pytest.raises(ValueError, match=error_msg): handle_problem_types(5) @@ -66,8 +77,8 @@ def test_detect_problem_type_binary(): y_binary = pd.Series([1, 0, 1, 0, 0, 1]) y_bool = pd.Series([True, False, True, True, True]) y_float = pd.Series([1.0, 0.0, 1.0, 1.0, 0.0, 0.0]) - y_object = pd.Series(['yes', 'no', 'no', 'yes']) - 
y_categorical = pd.Series(['yes', 'no', 'no', 'yes'], dtype='category') + y_object = pd.Series(["yes", "no", "no", "yes"]) + y_categorical = pd.Series(["yes", "no", "no", "yes"], dtype="category") y_null = pd.Series([None, 0, 1, 1, 1]) assert detect_problem_type(y_binary) == ProblemTypes.BINARY @@ -80,7 +91,7 @@ def test_detect_problem_type_binary(): def test_detect_problem_type_multiclass(): y_multi = pd.Series([1, 2, 0, 2, 0, 0, 1]) - y_categorical = pd.Series(['yes', 'no', 'maybe', 'no'], dtype='category') + y_categorical = pd.Series(["yes", "no", "maybe", "no"], dtype="category") y_classes = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, np.nan] * 5) y_float = pd.Series([1.0, 2, 3]) y_obj = pd.Series(["y", "n", "m"]) @@ -103,8 +114,10 @@ def test_detect_problem_type_regression(): def test_numeric_extensions(): - y_Int64 = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype='Int64') - y_Int64_null = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, None], dtype='Int64') + y_Int64 = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype="Int64") + y_Int64_null = pd.Series( + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, None], dtype="Int64" + ) assert detect_problem_type(y_Int64) == ProblemTypes.REGRESSION assert detect_problem_type(y_Int64_null) == ProblemTypes.REGRESSION @@ -139,22 +152,36 @@ def test_all_problem_types(): ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.TIME_SERIES_BINARY, - ProblemTypes.TIME_SERIES_MULTICLASS + ProblemTypes.TIME_SERIES_MULTICLASS, ] assert ProblemTypes.all_problem_types == expected -@pytest.mark.parametrize('problem_type', ProblemTypes.all_problem_types) +@pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) def test_type_checks(problem_type): - assert is_regression(problem_type) == (problem_type in - [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]) - assert is_binary(problem_type) == (problem_type in - [ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY]) - assert is_multiclass(problem_type) == (problem_type in - [ProblemTypes.MULTICLASS, ProblemTypes.TIME_SERIES_MULTICLASS]) - assert is_classification(problem_type) == (problem_type in - [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS]) - assert is_time_series(problem_type) == (problem_type in - [ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS, - ProblemTypes.TIME_SERIES_REGRESSION]) + assert is_regression(problem_type) == ( + problem_type in [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION] + ) + assert is_binary(problem_type) == ( + problem_type in [ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY] + ) + assert is_multiclass(problem_type) == ( + problem_type in [ProblemTypes.MULTICLASS, ProblemTypes.TIME_SERIES_MULTICLASS] + ) + assert is_classification(problem_type) == ( + problem_type + in [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ] + ) + assert is_time_series(problem_type) == ( + problem_type + in [ + ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.TIME_SERIES_MULTICLASS, + ProblemTypes.TIME_SERIES_REGRESSION, + ] + ) diff --git a/evalml/tests/test_all_test_dirs_included.py b/evalml/tests/test_all_test_dirs_included.py index 2ad1b6190d..d242fa01fa 100644 --- a/evalml/tests/test_all_test_dirs_included.py +++ b/evalml/tests/test_all_test_dirs_included.py @@ -6,6 +6,12 @@ def test_all_test_dirs_included(): all_modules = find_packages() 
test_dir = os.path.dirname(__file__) - all_test_dirs_with_init_files = [module for module in all_modules if "evalml.tests" in module] - all_test_dirs = [dirname for dirname, _, files in os.walk(test_dir) if "__pycache__" not in dirname and "test" in os.path.split(dirname)[1]] + all_test_dirs_with_init_files = [ + module for module in all_modules if "evalml.tests" in module + ] + all_test_dirs = [ + dirname + for dirname, _, files in os.walk(test_dir) + if "__pycache__" not in dirname and "test" in os.path.split(dirname)[1] + ] assert len(all_test_dirs) == len(all_test_dirs_with_init_files) diff --git a/evalml/tests/tuner_tests/test_grid_search_tuner.py b/evalml/tests/tuner_tests/test_grid_search_tuner.py index 7d7c875b98..61ee9a54ef 100644 --- a/evalml/tests/tuner_tests/test_grid_search_tuner.py +++ b/evalml/tests/tuner_tests/test_grid_search_tuner.py @@ -17,7 +17,10 @@ def test_grid_search_tuner_unique_values(dummy_pipeline_hyperparameters): assert len(generated_parameters) == 10 for i in range(10): assert generated_parameters[i].keys() == dummy_pipeline_hyperparameters.keys() - assert generated_parameters[i]['Mock Classifier'].keys() == dummy_pipeline_hyperparameters['Mock Classifier'].keys() + assert ( + generated_parameters[i]["Mock Classifier"].keys() + == dummy_pipeline_hyperparameters["Mock Classifier"].keys() + ) def test_grid_search_tuner_no_params(dummy_pipeline_hyperparameters_small): @@ -28,16 +31,17 @@ def test_grid_search_tuner_no_params(dummy_pipeline_hyperparameters_small): tuner.propose() -def test_grid_search_tuner_basic(dummy_pipeline_hyperparameters, - dummy_pipeline_hyperparameters_unicode): +def test_grid_search_tuner_basic( + dummy_pipeline_hyperparameters, dummy_pipeline_hyperparameters_unicode +): tuner = GridSearchTuner(dummy_pipeline_hyperparameters) proposed_params = tuner.propose() assert proposed_params == { - 'Mock Classifier': { - 'param a': 0, - 'param b': 0.0, - 'param c': 'option a', - 'param d': 'option a' + "Mock Classifier": { + "param a": 0, + "param b": 0.0, + "param c": "option a", + "param d": "option a", } } tuner.add(proposed_params, 0.5) @@ -45,35 +49,35 @@ def test_grid_search_tuner_basic(dummy_pipeline_hyperparameters, tuner = GridSearchTuner(dummy_pipeline_hyperparameters_unicode) proposed_params = tuner.propose() assert proposed_params == { - 'Mock Classifier': { - 'param a': 0, - 'param b': 0.0, - 'param c': 'option a 💩', - 'param d': 'option a' + "Mock Classifier": { + "param a": 0, + "param b": 0.0, + "param c": "option a 💩", + "param d": "option a", } } tuner.add(proposed_params, 0.5) def test_grid_search_tuner_space_types(): - tuner = GridSearchTuner({'Mock Classifier': {'param a': (0, 10)}}) + tuner = GridSearchTuner({"Mock Classifier": {"param a": (0, 10)}}) proposed_params = tuner.propose() - assert proposed_params == {'Mock Classifier': {'param a': 0}} + assert proposed_params == {"Mock Classifier": {"param a": 0}} - tuner = GridSearchTuner({'Mock Classifier': {'param a': (0, 10.0)}}) + tuner = GridSearchTuner({"Mock Classifier": {"param a": (0, 10.0)}}) proposed_params = tuner.propose() - assert proposed_params == {'Mock Classifier': {'param a': 0}} + assert proposed_params == {"Mock Classifier": {"param a": 0}} def test_grid_search_tuner_invalid_space(): bound_error_text = "Upper bound must be greater than lower bound. 
Parameter lower bound is 1 and upper bound is 0" with pytest.raises(ValueError, match=bound_error_text): - GridSearchTuner({'Mock Classifier': {'param a': (1, 0)}}) + GridSearchTuner({"Mock Classifier": {"param a": (1, 0)}}) def test_grid_search_tuner_valid_space(): - GridSearchTuner({'Mock Classifier': {'param a': 1}}) - GridSearchTuner({'Mock Classifier': {'param a': "param_value"}}) - tuner = GridSearchTuner({'Mock Classifier': {'param a': 3.200}}) + GridSearchTuner({"Mock Classifier": {"param a": 1}}) + GridSearchTuner({"Mock Classifier": {"param a": "param_value"}}) + tuner = GridSearchTuner({"Mock Classifier": {"param a": 3.200}}) proposed_params = tuner.propose() - assert proposed_params == {'Mock Classifier': {}} + assert proposed_params == {"Mock Classifier": {}} diff --git a/evalml/tests/tuner_tests/test_random_search_tuner.py b/evalml/tests/tuner_tests/test_random_search_tuner.py index 7c751190fe..e239d679f6 100644 --- a/evalml/tests/tuner_tests/test_random_search_tuner.py +++ b/evalml/tests/tuner_tests/test_random_search_tuner.py @@ -20,11 +20,18 @@ def test_random_search_tuner_unique_values(dummy_pipeline_hyperparameters): assert len(generated_parameters) == 3 for i in range(3): assert generated_parameters[i].keys() == dummy_pipeline_hyperparameters.keys() - assert generated_parameters[i]['Mock Classifier'].keys() == dummy_pipeline_hyperparameters['Mock Classifier'].keys() + assert ( + generated_parameters[i]["Mock Classifier"].keys() + == dummy_pipeline_hyperparameters["Mock Classifier"].keys() + ) def test_random_search_tuner_no_params(dummy_pipeline_hyperparameters_small): - tuner = RandomSearchTuner(dummy_pipeline_hyperparameters_small, random_seed=random_seed, with_replacement=False) + tuner = RandomSearchTuner( + dummy_pipeline_hyperparameters_small, + random_seed=random_seed, + with_replacement=False, + ) error_text = "Cannot create a unique set of unexplored parameters. Try expanding the search space." 
with pytest.raises(NoParamsException, match=error_text): for i in range(10): @@ -32,58 +39,76 @@ def test_random_search_tuner_no_params(dummy_pipeline_hyperparameters_small): def test_random_search_tuner_with_replacement(dummy_pipeline_hyperparameters): - tuner = RandomSearchTuner(dummy_pipeline_hyperparameters, random_seed=random_seed, with_replacement=True) + tuner = RandomSearchTuner( + dummy_pipeline_hyperparameters, random_seed=random_seed, with_replacement=True + ) for i in range(10): proposal = tuner.propose() assert isinstance(proposal, dict) assert proposal.keys() == dummy_pipeline_hyperparameters.keys() - assert proposal['Mock Classifier'].keys() == dummy_pipeline_hyperparameters['Mock Classifier'].keys() + assert ( + proposal["Mock Classifier"].keys() + == dummy_pipeline_hyperparameters["Mock Classifier"].keys() + ) -def test_random_search_tuner_basic(dummy_pipeline_hyperparameters, - dummy_pipeline_hyperparameters_unicode): +def test_random_search_tuner_basic( + dummy_pipeline_hyperparameters, dummy_pipeline_hyperparameters_unicode +): tuner = RandomSearchTuner(dummy_pipeline_hyperparameters, random_seed=random_seed) proposed_params = tuner.propose() assert proposed_params == { - 'Mock Classifier': { - 'param a': 5, - 'param b': 8.442657485810175, - 'param c': 'option c', - 'param d': np.inf + "Mock Classifier": { + "param a": 5, + "param b": 8.442657485810175, + "param c": "option c", + "param d": np.inf, } } tuner.add(proposed_params, 0.5) - tuner = RandomSearchTuner(dummy_pipeline_hyperparameters_unicode, random_seed=random_seed) + tuner = RandomSearchTuner( + dummy_pipeline_hyperparameters_unicode, random_seed=random_seed + ) proposed_params = tuner.propose() assert proposed_params == { - 'Mock Classifier': { - 'param a': 5, - 'param b': 8.442657485810175, - 'param c': 'option c 💩', - 'param d': np.inf + "Mock Classifier": { + "param a": 5, + "param b": 8.442657485810175, + "param c": "option c 💩", + "param d": np.inf, } } tuner.add(proposed_params, 0.5) def test_random_search_tuner_space_types(): - tuner = RandomSearchTuner({'Mock Classifier': {'param a': (0, 10)}}, random_seed=random_seed) + tuner = RandomSearchTuner( + {"Mock Classifier": {"param a": (0, 10)}}, random_seed=random_seed + ) proposed_params = tuner.propose() - assert proposed_params == {'Mock Classifier': {'param a': 5}} + assert proposed_params == {"Mock Classifier": {"param a": 5}} - tuner = RandomSearchTuner({'Mock Classifier': {'param a': (0, 10.0)}}, random_seed=random_seed) + tuner = RandomSearchTuner( + {"Mock Classifier": {"param a": (0, 10.0)}}, random_seed=random_seed + ) proposed_params = tuner.propose() - assert proposed_params == {'Mock Classifier': {'param a': 5.488135039273248}} + assert proposed_params == {"Mock Classifier": {"param a": 5.488135039273248}} - tuner = RandomSearchTuner({'Mock Classifier': {'param a': 10.0}}, random_seed=random_seed) + tuner = RandomSearchTuner( + {"Mock Classifier": {"param a": 10.0}}, random_seed=random_seed + ) proposed_params = tuner.propose() - assert proposed_params == {'Mock Classifier': {}} + assert proposed_params == {"Mock Classifier": {}} def test_random_search_tuner_invalid_space(): bound_error_text = "has to be less than the upper bound" with pytest.raises(ValueError, match=bound_error_text): - RandomSearchTuner({'Mock Classifier': {'param a': (1, 0)}}, random_seed=random_seed) + RandomSearchTuner( + {"Mock Classifier": {"param a": (1, 0)}}, random_seed=random_seed + ) with pytest.raises(ValueError, match=bound_error_text): - 
RandomSearchTuner({'Mock Classifier': {'param a': (0, 0)}}, random_seed=random_seed) + RandomSearchTuner( + {"Mock Classifier": {"param a": (0, 0)}}, random_seed=random_seed + ) diff --git a/evalml/tests/tuner_tests/test_skopt_tuner.py b/evalml/tests/tuner_tests/test_skopt_tuner.py index 84e3b9a8e1..e01c94d128 100644 --- a/evalml/tests/tuner_tests/test_skopt_tuner.py +++ b/evalml/tests/tuner_tests/test_skopt_tuner.py @@ -13,23 +13,41 @@ def test_tuner_init(): - with pytest.raises(TypeError, match="Can't instantiate abstract class Tuner with abstract methods add, propose"): + with pytest.raises( + TypeError, + match="Can't instantiate abstract class Tuner with abstract methods add, propose", + ): Tuner({}) def test_skopt_tuner_init(): - with pytest.raises(ValueError, match='pipeline_hyperparameter_ranges must be a dict but is of type '): - SKOptTuner({'My Component'}) - with pytest.raises(ValueError, match='pipeline_hyperparameter_ranges has invalid entry for My Component: True'): - SKOptTuner({'My Component': True}) - with pytest.raises(ValueError, match='pipeline_hyperparameter_ranges has invalid entry for My Component'): - SKOptTuner({'My Component': 0}) - with pytest.raises(ValueError, match='pipeline_hyperparameter_ranges has invalid entry for My Component'): - SKOptTuner({'My Component': None}) - with pytest.raises(ValueError, match='pipeline_hyperparameter_ranges has invalid dimensions for My Component parameter param a: None'): - SKOptTuner({'My Component': {'param a': None}}) + with pytest.raises( + ValueError, + match="pipeline_hyperparameter_ranges must be a dict but is of type ", + ): + SKOptTuner({"My Component"}) + with pytest.raises( + ValueError, + match="pipeline_hyperparameter_ranges has invalid entry for My Component: True", + ): + SKOptTuner({"My Component": True}) + with pytest.raises( + ValueError, + match="pipeline_hyperparameter_ranges has invalid entry for My Component", + ): + SKOptTuner({"My Component": 0}) + with pytest.raises( + ValueError, + match="pipeline_hyperparameter_ranges has invalid entry for My Component", + ): + SKOptTuner({"My Component": None}) + with pytest.raises( + ValueError, + match="pipeline_hyperparameter_ranges has invalid dimensions for My Component parameter param a: None", + ): + SKOptTuner({"My Component": {"param a": None}}) SKOptTuner({}) - SKOptTuner({'My Component': {}}) + SKOptTuner({"My Component": {}}) def test_skopt_tuner_is_search_space_exhausted(): @@ -38,121 +56,221 @@ def test_skopt_tuner_is_search_space_exhausted(): def test_skopt_tuner_basic(): - pipeline_hyperparameter_ranges = {'Mock Classifier': { - 'parameter a': Integer(0, 10), - 'parameter b': Real(0, 10), - 'parameter c': (0, 10), - 'parameter d': (0, 10.0), - 'parameter e': ['option a', 'option b', 'option c'], - 'parameter f': ['option a 💩', 'option b 💩', 'option c 💩'], - 'parameter g': ['option a', 'option b', 100, np.inf] - }} + pipeline_hyperparameter_ranges = { + "Mock Classifier": { + "parameter a": Integer(0, 10), + "parameter b": Real(0, 10), + "parameter c": (0, 10), + "parameter d": (0, 10.0), + "parameter e": ["option a", "option b", "option c"], + "parameter f": ["option a 💩", "option b 💩", "option c 💩"], + "parameter g": ["option a", "option b", 100, np.inf], + } + } tuner = SKOptTuner(pipeline_hyperparameter_ranges, random_seed=random_seed) assert isinstance(tuner, Tuner) proposed_params = tuner.propose() assert proposed_params == { - 'Mock Classifier': { - 'parameter a': 5, - 'parameter b': 8.442657485810175, - 'parameter c': 3, - 'parameter d': 
8.472517387841256, - 'parameter e': 'option b', - 'parameter f': 'option b 💩', - 'parameter g': 'option b' + "Mock Classifier": { + "parameter a": 5, + "parameter b": 8.442657485810175, + "parameter c": 3, + "parameter d": 8.472517387841256, + "parameter e": "option b", + "parameter f": "option b 💩", + "parameter g": "option b", } } tuner.add(proposed_params, 0.5) def test_skopt_tuner_invalid_ranges(): - SKOptTuner({'Mock Classifier': { - 'param a': Integer(0, 10), - 'param b': Real(0, 10), - 'param c': ['option a', 'option b', 'option c'] - }}, random_seed=random_seed) - - with pytest.raises(ValueError, match="Invalid dimension \\[\\]. Read the documentation for supported types."): - SKOptTuner({'Mock Classifier': { - 'param a': Integer(0, 10), - 'param b': Real(0, 10), - 'param c': [] - }}, random_seed=random_seed) - with pytest.raises(ValueError, match="pipeline_hyperparameter_ranges has invalid dimensions for Mock Classifier parameter param c"): - SKOptTuner({'Mock Classifier': { - 'param a': Integer(0, 10), - 'param b': Real(0, 10), - 'param c': None - }}, random_seed=random_seed) + SKOptTuner( + { + "Mock Classifier": { + "param a": Integer(0, 10), + "param b": Real(0, 10), + "param c": ["option a", "option b", "option c"], + } + }, + random_seed=random_seed, + ) + + with pytest.raises( + ValueError, + match="Invalid dimension \\[\\]. Read the documentation for supported types.", + ): + SKOptTuner( + { + "Mock Classifier": { + "param a": Integer(0, 10), + "param b": Real(0, 10), + "param c": [], + } + }, + random_seed=random_seed, + ) + with pytest.raises( + ValueError, + match="pipeline_hyperparameter_ranges has invalid dimensions for Mock Classifier parameter param c", + ): + SKOptTuner( + { + "Mock Classifier": { + "param a": Integer(0, 10), + "param b": Real(0, 10), + "param c": None, + } + }, + random_seed=random_seed, + ) def test_skopt_tuner_single_value(): - SKOptTuner({'Mock Classifier': { - 'param a': Integer(0, 10), - 'param b': Real(0, 10), - 'param c': 'Value' - }}, random_seed=random_seed) + SKOptTuner( + { + "Mock Classifier": { + "param a": Integer(0, 10), + "param b": Real(0, 10), + "param c": "Value", + } + }, + random_seed=random_seed, + ) - tuner = SKOptTuner({'Mock Classifier': { - 'param c': 10 - }}, random_seed=random_seed) + tuner = SKOptTuner({"Mock Classifier": {"param c": 10}}, random_seed=random_seed) proposed_params = tuner.propose() - assert proposed_params == {'Mock Classifier': {}} + assert proposed_params == {"Mock Classifier": {}} def test_skopt_tuner_invalid_parameters_score(): - pipeline_hyperparameter_ranges = {'Mock Classifier': { - 'param a': Integer(0, 10), - 'param b': Real(0, 10), - 'param c': ['option a', 'option b', 'option c'] - }} + pipeline_hyperparameter_ranges = { + "Mock Classifier": { + "param a": Integer(0, 10), + "param b": Real(0, 10), + "param c": ["option a", "option b", "option c"], + } + } tuner = SKOptTuner(pipeline_hyperparameter_ranges, random_seed=random_seed) - with pytest.raises(TypeError, match='Pipeline parameters missing required field "param a" for component "Mock Classifier"'): + with pytest.raises( + TypeError, + match='Pipeline parameters missing required field "param a" for component "Mock Classifier"', + ): tuner.add({}, 0.5) - with pytest.raises(TypeError, match='Pipeline parameters missing required field "param a" for component "Mock Classifier"'): - tuner.add({'Mock Classifier': {}}, 0.5) - with pytest.raises(TypeError, match='Pipeline parameters missing required field "param b" for component "Mock 
Classifier"'): - tuner.add({'Mock Classifier': {'param a': 0}}, 0.5) + with pytest.raises( + TypeError, + match='Pipeline parameters missing required field "param a" for component "Mock Classifier"', + ): + tuner.add({"Mock Classifier": {}}, 0.5) + with pytest.raises( + TypeError, + match='Pipeline parameters missing required field "param b" for component "Mock Classifier"', + ): + tuner.add({"Mock Classifier": {"param a": 0}}, 0.5) with pytest.raises(ValueError, match="is not within the bounds of the space"): - tuner.add({'Mock Classifier': {'param a': 0, 'param b': 0.0, 'param c': 0}}, 0.5) + tuner.add( + {"Mock Classifier": {"param a": 0, "param b": 0.0, "param c": 0}}, 0.5 + ) with pytest.raises(ValueError, match="is not within the bounds of the space"): - tuner.add({'Mock Classifier': {'param a': -1, 'param b': 0.0, 'param c': 'option a'}}, 0.5) + tuner.add( + {"Mock Classifier": {"param a": -1, "param b": 0.0, "param c": "option a"}}, + 0.5, + ) with pytest.raises(ValueError, match="is not within the bounds of the space"): - tuner.add({'Mock Classifier': {'param a': 0, 'param b': 11.0, 'param c': 'option a'}}, 0.5) + tuner.add( + {"Mock Classifier": {"param a": 0, "param b": 11.0, "param c": "option a"}}, + 0.5, + ) with pytest.raises(ValueError, match="is not within the bounds of the space"): - tuner.add({'Mock Classifier': {'param a': 0, 'param b': 0.0, 'param c': 'option d'}}, 0.5) + tuner.add( + {"Mock Classifier": {"param a": 0, "param b": 0.0, "param c": "option d"}}, + 0.5, + ) with pytest.raises(ValueError, match="is not within the bounds of the space"): - tuner.add({'Mock Classifier': {'param a': np.nan, 'param b': 0.0, 'param c': 'option a'}}, 0.5) + tuner.add( + { + "Mock Classifier": { + "param a": np.nan, + "param b": 0.0, + "param c": "option a", + } + }, + 0.5, + ) with pytest.raises(ValueError, match="is not within the bounds of the space"): - tuner.add({'Mock Classifier': {'param a': np.inf, 'param b': 0.0, 'param c': 'option a'}}, 0.5) - with pytest.raises(ParameterError, match="Invalid parameters specified to SKOptTuner.add"): - tuner.add({'Mock Classifier': {'param a': None, 'param b': 0.0, 'param c': 'option a'}}, 0.5) - with patch('evalml.tuners.skopt_tuner.Optimizer.tell') as mock_optimizer_tell: - msg = 'Mysterious internal error' + tuner.add( + { + "Mock Classifier": { + "param a": np.inf, + "param b": 0.0, + "param c": "option a", + } + }, + 0.5, + ) + with pytest.raises( + ParameterError, match="Invalid parameters specified to SKOptTuner.add" + ): + tuner.add( + { + "Mock Classifier": { + "param a": None, + "param b": 0.0, + "param c": "option a", + } + }, + 0.5, + ) + with patch("evalml.tuners.skopt_tuner.Optimizer.tell") as mock_optimizer_tell: + msg = "Mysterious internal error" mock_optimizer_tell.side_effect = Exception(msg) with pytest.raises(Exception, match=msg): - tuner.add({'Mock Classifier': {'param a': 0, 'param b': 0.0, 'param c': 'option a'}}, 0.5) - tuner.add({'Mock Classifier': {'param a': 0, 'param b': 1.0, 'param c': 'option a'}}, 0.5) - tuner.add({'Mock Classifier': {'param a': 0, 'param b': 1.0, 'param c': 'option a'}}, np.nan) - tuner.add({'Mock Classifier': {'param a': 0, 'param b': 1.0, 'param c': 'option a'}}, np.inf) - tuner.add({'Mock Classifier': {'param a': 0, 'param b': 1.0, 'param c': 'option a'}}, None) + tuner.add( + { + "Mock Classifier": { + "param a": 0, + "param b": 0.0, + "param c": "option a", + } + }, + 0.5, + ) + tuner.add( + {"Mock Classifier": {"param a": 0, "param b": 1.0, "param c": "option a"}}, 0.5 + ) + 
tuner.add( + {"Mock Classifier": {"param a": 0, "param b": 1.0, "param c": "option a"}}, + np.nan, + ) + tuner.add( + {"Mock Classifier": {"param a": 0, "param b": 1.0, "param c": "option a"}}, + np.inf, + ) + tuner.add( + {"Mock Classifier": {"param a": 0, "param b": 1.0, "param c": "option a"}}, None + ) tuner.propose() def test_skopt_tuner_propose(): - pipeline_hyperparameter_ranges = {'Mock Classifier': { - 'param a': Integer(0, 10), - 'param b': Real(0, 10), - 'param c': ['option a', 'option b', 'option c'] - }} + pipeline_hyperparameter_ranges = { + "Mock Classifier": { + "param a": Integer(0, 10), + "param b": Real(0, 10), + "param c": ["option a", "option b", "option c"], + } + } tuner = SKOptTuner(pipeline_hyperparameter_ranges, random_seed=random_seed) - tuner.add({'Mock Classifier': {'param a': 0, 'param b': 1.0, 'param c': 'option a'}}, 0.5) + tuner.add( + {"Mock Classifier": {"param a": 0, "param b": 1.0, "param c": "option a"}}, 0.5 + ) parameters = tuner.propose() assert parameters == { - 'Mock Classifier': { - 'param a': 5, - 'param b': 8.442657485810175, - 'param c': 'option c' + "Mock Classifier": { + "param a": 5, + "param b": 8.442657485810175, + "param c": "option c", } } diff --git a/evalml/tests/utils_tests/test_cli_utils.py b/evalml/tests/utils_tests/test_cli_utils.py index 587e25192a..e7d8182c70 100644 --- a/evalml/tests/utils_tests/test_cli_utils.py +++ b/evalml/tests/utils_tests/test_cli_utils.py @@ -13,7 +13,7 @@ get_sys_info, print_deps, print_info, - print_sys_info + print_sys_info, ) @@ -23,10 +23,12 @@ def current_dir(): def get_core_requirements(current_dir): - reqs_path = os.path.join(current_dir, pathlib.Path('..', '..', '..', 'core-requirements.txt')) - lines = open(reqs_path, 'r').readlines() - lines = [line for line in lines if '-r ' not in line] - reqs = requirements.parse(''.join(lines)) + reqs_path = os.path.join( + current_dir, pathlib.Path("..", "..", "..", "core-requirements.txt") + ) + lines = open(reqs_path, "r").readlines() + lines = [line for line in lines if "-r " not in line] + reqs = requirements.parse("".join(lines)) reqs_names = [req.name for req in reqs] return reqs_names @@ -40,7 +42,7 @@ def test_print_cli_cmd(): def test_print_cli_info_cmd(caplog): runner = CliRunner() - result = runner.invoke(cli, ['info']) + result = runner.invoke(cli, ["info"]) assert result.exit_code == 0 assert "EvalML version:" in caplog.text assert "EvalML installation directory:" in caplog.text @@ -74,15 +76,25 @@ def test_print_deps_info(caplog, current_dir): def test_sys_info(): sys_info = get_sys_info() - info_keys = ["python", "python-bits", "OS", - "OS-release", "machine", "processor", - "byteorder", "LC_ALL", "LANG", "LOCALE", - "# of CPUS", "Available memory"] + info_keys = [ + "python", + "python-bits", + "OS", + "OS-release", + "machine", + "processor", + "byteorder", + "LC_ALL", + "LANG", + "LOCALE", + "# of CPUS", + "Available memory", + ] found_keys = [k for k, _ in sys_info] assert set(info_keys).issubset(found_keys) -@patch('platform.uname') +@patch("platform.uname") def test_sys_info_error(mock_uname): mock_uname.side_effects = ValueError() assert len(get_sys_info()) == 0 @@ -96,5 +108,5 @@ def test_installed_packages(current_dir): def test_get_evalml_root(current_dir): - root = os.path.abspath(os.path.join(current_dir, '..', "..")) + root = os.path.abspath(os.path.join(current_dir, "..", "..")) assert get_evalml_root() == root diff --git a/evalml/tests/utils_tests/test_dependencies.py b/evalml/tests/utils_tests/test_dependencies.py index 
e066384357..fc7edd18f3 100644 --- a/evalml/tests/utils_tests/test_dependencies.py +++ b/evalml/tests/utils_tests/test_dependencies.py @@ -6,23 +6,29 @@ def _get_req_name(name): - if name == 'imbalanced-learn': - return 'imblearn' - elif name == 'MarkupSafe': - return 'markupsafe' + if name == "imbalanced-learn": + return "imblearn" + elif name == "MarkupSafe": + return "markupsafe" return name -def test_has_minimal_deps(has_minimal_dependencies, is_running_py_39_or_above, is_using_conda): - reqs_path = pathlib.Path(__file__).absolute().parents[3].joinpath('requirements.txt') - lines = open(reqs_path, 'r').readlines() - lines = [line for line in lines if '-r ' not in line] - reqs = requirements.parse(''.join(lines)) +def test_has_minimal_deps( + has_minimal_dependencies, is_running_py_39_or_above, is_using_conda +): + reqs_path = ( + pathlib.Path(__file__).absolute().parents[3].joinpath("requirements.txt") + ) + lines = open(reqs_path, "r").readlines() + lines = [line for line in lines if "-r " not in line] + reqs = requirements.parse("".join(lines)) extra_deps = [_get_req_name(req.name) for req in reqs] - extra_deps += ['plotly.graph_objects'] + extra_deps += ["plotly.graph_objects"] for module in extra_deps: # We don't expect to install sktime in python 3.9. Let's verify it's not present: - if (module == 'sktime' and is_running_py_39_or_above) or (module == 'pmdarima' and is_using_conda): + if (module == "sktime" and is_running_py_39_or_above) or ( + module == "pmdarima" and is_using_conda + ): with pytest.raises(ModuleNotFoundError): import_module(module) continue @@ -30,16 +36,20 @@ def test_has_minimal_deps(has_minimal_dependencies, is_running_py_39_or_above, i try: import_module(module) # an extra dep was imported. if the tests were configured with --has-minimal-deps, that's an error. - assert not has_minimal_dependencies, ("The test environment includes extra dependency '{}', " + - "but tests were configured with " + - "'--has-minimal-dependencies'. Please either uninstall " + - "all extra dependencies as listed in requirements.txt, " + - "or rerun the tests without " + - "'--has-minimal-dependencies'.").format(module) + assert not has_minimal_dependencies, ( + "The test environment includes extra dependency '{}', " + + "but tests were configured with " + + "'--has-minimal-dependencies'. Please either uninstall " + + "all extra dependencies as listed in requirements.txt, " + + "or rerun the tests without " + + "'--has-minimal-dependencies'." + ).format(module) except ImportError: # an extra dep failed to import. if the tests were configured with --has-minimal-deps, that's # expected. otherwise, it's an error. - assert has_minimal_dependencies, ("The test environment is missing expected extra dependency '{}'. " + - "Please either install all requirements in requirements.txt, " + - "or rerun the tests with " + - "'--has-minimal-dependencies'.").format(module) + assert has_minimal_dependencies, ( + "The test environment is missing expected extra dependency '{}'. " + + "Please either install all requirements in requirements.txt, " + + "or rerun the tests with " + + "'--has-minimal-dependencies'." 
+ ).format(module) diff --git a/evalml/tests/utils_tests/test_gen_utils.py b/evalml/tests/utils_tests/test_gen_utils.py index 4a1b285986..fd8ba3fa80 100644 --- a/evalml/tests/utils_tests/test_gen_utils.py +++ b/evalml/tests/utils_tests/test_gen_utils.py @@ -21,11 +21,11 @@ import_or_raise, jupyter_check, pad_with_nans, - save_plot + save_plot, ) -@patch('importlib.import_module') +@patch("importlib.import_module") def test_import_or_raise_errors(dummy_importlib): def _mock_import_function(library_str): if library_str == "_evalml": @@ -37,9 +37,15 @@ def _mock_import_function(library_str): with pytest.raises(ImportError, match="Missing optional dependency '_evalml'"): import_or_raise("_evalml") - with pytest.raises(ImportError, match="Missing optional dependency '_evalml'. Please use pip to install _evalml. Additional error message"): + with pytest.raises( + ImportError, + match="Missing optional dependency '_evalml'. Please use pip to install _evalml. Additional error message", + ): import_or_raise("_evalml", "Additional error message") - with pytest.raises(Exception, match="An exception occurred while trying to import `attr_error_lib`: Mock Exception executed!"): + with pytest.raises( + Exception, + match="An exception occurred while trying to import `attr_error_lib`: Mock Exception executed!", + ): import_or_raise("attr_error_lib") @@ -69,9 +75,7 @@ def test_convert_to_seconds(): def test_get_random_seed_rng(): - def make_mock_random_state(return_value): - class MockRandomState(np.random.RandomState): def __init__(self): self.min_bound = None @@ -82,6 +86,7 @@ def randint(self, min_bound, max_bound): self.min_bound = min_bound self.max_bound = max_bound return return_value + return MockRandomState() rng = make_mock_random_state(42) @@ -110,17 +115,24 @@ def test_get_random_seed_int(): assert get_random_seed(SEED_BOUNDS.min_bound + 2) == SEED_BOUNDS.min_bound + 2 # vectorize get_random_seed via a wrapper for easy evaluation - default_min_bound = inspect.signature(get_random_seed).parameters['min_bound'].default - default_max_bound = inspect.signature(get_random_seed).parameters['max_bound'].default + default_min_bound = ( + inspect.signature(get_random_seed).parameters["min_bound"].default + ) + default_max_bound = ( + inspect.signature(get_random_seed).parameters["max_bound"].default + ) assert default_min_bound == SEED_BOUNDS.min_bound assert default_max_bound == SEED_BOUNDS.max_bound - def get_random_seed_vec(min_bound=None, max_bound=None): # passing None for either means no value is provided to get_random_seed - + def get_random_seed_vec( + min_bound=None, max_bound=None + ): # passing None for either means no value is provided to get_random_seed def get_random_seed_wrapper(random_seed): - return get_random_seed(random_seed, - min_bound=min_bound if min_bound is not None else default_min_bound, - max_bound=max_bound if max_bound is not None else default_max_bound) + return get_random_seed( + random_seed, + min_bound=min_bound if min_bound is not None else default_min_bound, + max_bound=max_bound if max_bound is not None else default_max_bound, + ) return np.vectorize(get_random_seed_wrapper) @@ -129,25 +141,49 @@ def get_random_seed_wrapper(random_seed): vals = np.arange(-100, 100) def make_expected_values(vals, min_bound, max_bound): - return np.array([i if (min_bound <= i and i < max_bound) else ((i - min_bound) % (max_bound - min_bound)) + min_bound - for i in vals]) - - np.testing.assert_equal(get_random_seed_vec(min_bound=None, max_bound=None)(vals), - make_expected_values(vals, 
min_bound=SEED_BOUNDS.min_bound, max_bound=SEED_BOUNDS.max_bound)) - np.testing.assert_equal(get_random_seed_vec(min_bound=None, max_bound=10)(vals), - make_expected_values(vals, min_bound=SEED_BOUNDS.min_bound, max_bound=10)) - np.testing.assert_equal(get_random_seed_vec(min_bound=-10, max_bound=None)(vals), - make_expected_values(vals, min_bound=-10, max_bound=SEED_BOUNDS.max_bound)) - np.testing.assert_equal(get_random_seed_vec(min_bound=0, max_bound=5)(vals), - make_expected_values(vals, min_bound=0, max_bound=5)) - np.testing.assert_equal(get_random_seed_vec(min_bound=-5, max_bound=0)(vals), - make_expected_values(vals, min_bound=-5, max_bound=0)) - np.testing.assert_equal(get_random_seed_vec(min_bound=-5, max_bound=5)(vals), - make_expected_values(vals, min_bound=-5, max_bound=5)) - np.testing.assert_equal(get_random_seed_vec(min_bound=5, max_bound=10)(vals), - make_expected_values(vals, min_bound=5, max_bound=10)) - np.testing.assert_equal(get_random_seed_vec(min_bound=-10, max_bound=-5)(vals), - make_expected_values(vals, min_bound=-10, max_bound=-5)) + return np.array( + [ + i + if (min_bound <= i and i < max_bound) + else ((i - min_bound) % (max_bound - min_bound)) + min_bound + for i in vals + ] + ) + + np.testing.assert_equal( + get_random_seed_vec(min_bound=None, max_bound=None)(vals), + make_expected_values( + vals, min_bound=SEED_BOUNDS.min_bound, max_bound=SEED_BOUNDS.max_bound + ), + ) + np.testing.assert_equal( + get_random_seed_vec(min_bound=None, max_bound=10)(vals), + make_expected_values(vals, min_bound=SEED_BOUNDS.min_bound, max_bound=10), + ) + np.testing.assert_equal( + get_random_seed_vec(min_bound=-10, max_bound=None)(vals), + make_expected_values(vals, min_bound=-10, max_bound=SEED_BOUNDS.max_bound), + ) + np.testing.assert_equal( + get_random_seed_vec(min_bound=0, max_bound=5)(vals), + make_expected_values(vals, min_bound=0, max_bound=5), + ) + np.testing.assert_equal( + get_random_seed_vec(min_bound=-5, max_bound=0)(vals), + make_expected_values(vals, min_bound=-5, max_bound=0), + ) + np.testing.assert_equal( + get_random_seed_vec(min_bound=-5, max_bound=5)(vals), + make_expected_values(vals, min_bound=-5, max_bound=5), + ) + np.testing.assert_equal( + get_random_seed_vec(min_bound=5, max_bound=10)(vals), + make_expected_values(vals, min_bound=5, max_bound=10), + ) + np.testing.assert_equal( + get_random_seed_vec(min_bound=-10, max_bound=-5)(vals), + make_expected_values(vals, min_bound=-10, max_bound=-5), + ) def test_class_property(): @@ -162,14 +198,13 @@ def caps_name(cls): def test_get_importable_subclasses_wont_get_custom_classes(): - class ChildClass(ComponentBase): pass assert ChildClass not in get_importable_subclasses(ComponentBase) -@patch('importlib.import_module') +@patch("importlib.import_module") def test_import_or_warn_errors(dummy_importlib): def _mock_import_function(library_str): if library_str == "_evalml": @@ -181,13 +216,19 @@ def _mock_import_function(library_str): with pytest.warns(UserWarning, match="Missing optional dependency '_evalml'"): import_or_raise("_evalml", warning=True) - with pytest.warns(UserWarning, match="Missing optional dependency '_evalml'. Please use pip to install _evalml. Additional error message"): + with pytest.warns( + UserWarning, + match="Missing optional dependency '_evalml'. Please use pip to install _evalml. 
Additional error message", + ): import_or_raise("_evalml", "Additional error message", warning=True) - with pytest.warns(UserWarning, match="An exception occurred while trying to import `attr_error_lib`: Mock Exception executed!"): + with pytest.warns( + UserWarning, + match="An exception occurred while trying to import `attr_error_lib`: Mock Exception executed!", + ): import_or_raise("attr_error_lib", warning=True) -@patch('evalml.utils.gen_utils.import_or_raise') +@patch("evalml.utils.gen_utils.import_or_raise") def test_jupyter_check_errors(mock_import_or_raise): mock_import_or_raise.side_effect = ImportError assert not jupyter_check() @@ -196,7 +237,7 @@ def test_jupyter_check_errors(mock_import_or_raise): assert not jupyter_check() -@patch('evalml.utils.gen_utils.import_or_raise') +@patch("evalml.utils.gen_utils.import_or_raise") def test_jupyter_check(mock_import_or_raise): mock_import_or_raise.return_value = MagicMock() mock_import_or_raise().core.getipython.get_ipython.return_value = True @@ -214,17 +255,43 @@ def _check_equality(data, expected, check_index_type=True): pd.testing.assert_frame_equal(data, expected, check_index_type) -@pytest.mark.parametrize("data,num_to_pad,expected", - [(pd.Series([1, 2, 3]), 1, pd.Series([np.nan, 1, 2, 3], dtype="float64")), - (pd.Series([1, 2, 3]), 0, pd.Series([1, 2, 3])), - (pd.Series([1, 2, 3, 4], index=pd.date_range("2020-10-01", "2020-10-04")), - 2, pd.Series([np.nan, np.nan, 1, 2, 3, 4], dtype="float64")), - (pd.DataFrame({"a": [1., 2., 3.], "b": [4., 5., 6.]}), 0, - pd.DataFrame({"a": pd.Series([1., 2., 3.], dtype="float64"), "b": pd.Series([4., 5., 6.], dtype="float64")})), - (pd.DataFrame({"a": [4, 5, 6], "b": ["a", "b", "c"]}), 1, - pd.DataFrame({"a": pd.Series([np.nan, 4, 5, 6], dtype="float64"), "b": [np.nan, "a", "b", "c"]})), - (pd.DataFrame({"a": [1, 0, 1]}), 2, - pd.DataFrame({"a": pd.Series([np.nan, np.nan, 1, 0, 1], dtype="float64")}))]) +@pytest.mark.parametrize( + "data,num_to_pad,expected", + [ + (pd.Series([1, 2, 3]), 1, pd.Series([np.nan, 1, 2, 3], dtype="float64")), + (pd.Series([1, 2, 3]), 0, pd.Series([1, 2, 3])), + ( + pd.Series([1, 2, 3, 4], index=pd.date_range("2020-10-01", "2020-10-04")), + 2, + pd.Series([np.nan, np.nan, 1, 2, 3, 4], dtype="float64"), + ), + ( + pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}), + 0, + pd.DataFrame( + { + "a": pd.Series([1.0, 2.0, 3.0], dtype="float64"), + "b": pd.Series([4.0, 5.0, 6.0], dtype="float64"), + } + ), + ), + ( + pd.DataFrame({"a": [4, 5, 6], "b": ["a", "b", "c"]}), + 1, + pd.DataFrame( + { + "a": pd.Series([np.nan, 4, 5, 6], dtype="float64"), + "b": [np.nan, "a", "b", "c"], + } + ), + ), + ( + pd.DataFrame({"a": [1, 0, 1]}), + 2, + pd.DataFrame({"a": pd.Series([np.nan, np.nan, 1, 0, 1], dtype="float64")}), + ), + ], +) def test_pad_with_nans(data, num_to_pad, expected): padded = pad_with_nans(data, num_to_pad) _check_equality(padded, expected) @@ -237,18 +304,39 @@ def test_pad_with_nans_with_series_name(): _check_equality(padded, pd.Series([np.nan, 1, 2, 3], name=name, dtype="float64")) -@pytest.mark.parametrize("data, expected", - [([pd.Series([None, 1., 2., 3]), pd.DataFrame({"a": [1., 2., 3, None]})], - [pd.Series([1., 2.], index=pd.Int64Index([1, 2])), - pd.DataFrame({"a": [2., 3.]}, index=pd.Int64Index([1, 2]))]), - ([pd.Series([None, 1., 2., 3]), pd.DataFrame({"a": [3., 4., None, None]})], - [pd.Series([1.], index=pd.Int64Index([1])), - pd.DataFrame({"a": [4.]}, index=pd.Int64Index([1]))]), - ([pd.DataFrame(), pd.Series([None, 1., 2., 3.])], - 
[pd.DataFrame(), pd.Series([1., 2., 3.], index=pd.Int64Index([1, 2, 3]))]), - ([pd.DataFrame({"a": [1., 2., None]}), pd.Series([])], - [pd.DataFrame({"a": [1., 2.]}), pd.Series([])]) - ]) +@pytest.mark.parametrize( + "data, expected", + [ + ( + [pd.Series([None, 1.0, 2.0, 3]), pd.DataFrame({"a": [1.0, 2.0, 3, None]})], + [ + pd.Series([1.0, 2.0], index=pd.Int64Index([1, 2])), + pd.DataFrame({"a": [2.0, 3.0]}, index=pd.Int64Index([1, 2])), + ], + ), + ( + [ + pd.Series([None, 1.0, 2.0, 3]), + pd.DataFrame({"a": [3.0, 4.0, None, None]}), + ], + [ + pd.Series([1.0], index=pd.Int64Index([1])), + pd.DataFrame({"a": [4.0]}, index=pd.Int64Index([1])), + ], + ), + ( + [pd.DataFrame(), pd.Series([None, 1.0, 2.0, 3.0])], + [ + pd.DataFrame(), + pd.Series([1.0, 2.0, 3.0], index=pd.Int64Index([1, 2, 3])), + ], + ), + ( + [pd.DataFrame({"a": [1.0, 2.0, None]}), pd.Series([])], + [pd.DataFrame({"a": [1.0, 2.0]}), pd.Series([])], + ), + ], +) def test_drop_nan(data, expected): no_nan_1, no_nan_2 = drop_rows_with_nans(*data) _check_equality(no_nan_1, expected[0], check_index_type=False) @@ -260,180 +348,323 @@ def test_rename_column_names_to_numeric(): pd.testing.assert_frame_equal(_rename_column_names_to_numeric(X), pd.DataFrame(X)) X = pd.DataFrame({"<>": [1, 2], ">>": [2, 4]}) - pd.testing.assert_frame_equal(_rename_column_names_to_numeric(X), pd.DataFrame({0: [1, 2], 1: [2, 4]})) + pd.testing.assert_frame_equal( + _rename_column_names_to_numeric(X), pd.DataFrame({0: [1, 2], 1: [2, 4]}) + ) X.ww.init(logical_types={"<>": "categorical", ">>": "categorical"}) X_renamed = _rename_column_names_to_numeric(X) - X_expected = pd.DataFrame({0: pd.Series([1, 2], dtype="category"), 1: pd.Series([2, 4], dtype="category")}) + X_expected = pd.DataFrame( + {0: pd.Series([1, 2], dtype="category"), 1: pd.Series([2, 4], dtype="category")} + ) pd.testing.assert_frame_equal(X_renamed, X_expected) -@pytest.mark.parametrize("file_name,format,interactive", - [ - ('test_plot', 'png', False), - ('test_plot.png', 'png', False), - ('test_plot.', 'png', False), - ('test_plot.png', 'jpeg', False) - ]) -def test_save_plotly_static_default_format(file_name, format, interactive, decision_tree_classification_pipeline_class, tmpdir, has_minimal_dependencies): +@pytest.mark.parametrize( + "file_name,format,interactive", + [ + ("test_plot", "png", False), + ("test_plot.png", "png", False), + ("test_plot.", "png", False), + ("test_plot.png", "jpeg", False), + ], +) +def test_save_plotly_static_default_format( + file_name, + format, + interactive, + decision_tree_classification_pipeline_class, + tmpdir, + has_minimal_dependencies, +): if not has_minimal_dependencies: pipeline = decision_tree_classification_pipeline_class feat_fig_ = pipeline.graph_feature_importance() - filepath = os.path.join(str(tmpdir), f'{file_name}') - no_output_ = save_plot(fig=feat_fig_, filepath=filepath, format=format, interactive=interactive, return_filepath=False) - output_ = save_plot(fig=feat_fig_, filepath=filepath, format=format, interactive=interactive, return_filepath=True) + filepath = os.path.join(str(tmpdir), f"{file_name}") + no_output_ = save_plot( + fig=feat_fig_, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=False, + ) + output_ = save_plot( + fig=feat_fig_, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=True, + ) assert not no_output_ assert os.path.exists(output_) assert isinstance(output_, str) - assert os.path.basename(output_) == 'test_plot.png' - - 
-@pytest.mark.parametrize("file_name,format,interactive", - [ - ('test_plot', 'jpeg', False) - ]) -def test_save_plotly_static_different_format(file_name, format, interactive, decision_tree_classification_pipeline_class, tmpdir, has_minimal_dependencies): + assert os.path.basename(output_) == "test_plot.png" + + +@pytest.mark.parametrize("file_name,format,interactive", [("test_plot", "jpeg", False)]) +def test_save_plotly_static_different_format( + file_name, + format, + interactive, + decision_tree_classification_pipeline_class, + tmpdir, + has_minimal_dependencies, +): if not has_minimal_dependencies: pipeline = decision_tree_classification_pipeline_class feat_fig_ = pipeline.graph_feature_importance() - filepath = os.path.join(str(tmpdir), f'{file_name}') - no_output_ = save_plot(fig=feat_fig_, filepath=filepath, format=format, interactive=interactive, return_filepath=False) - output_ = save_plot(fig=feat_fig_, filepath=filepath, format=format, interactive=interactive, return_filepath=True) + filepath = os.path.join(str(tmpdir), f"{file_name}") + no_output_ = save_plot( + fig=feat_fig_, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=False, + ) + output_ = save_plot( + fig=feat_fig_, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=True, + ) assert not no_output_ assert os.path.exists(output_) assert isinstance(output_, str) - assert os.path.basename(output_) == 'test_plot.jpeg' - - -@pytest.mark.parametrize("file_name,format,interactive", - [ - (None, 'jpeg', False) - ]) -def test_save_plotly_static_no_filepath(file_name, format, interactive, decision_tree_classification_pipeline_class, tmpdir, has_minimal_dependencies): + assert os.path.basename(output_) == "test_plot.jpeg" + + +@pytest.mark.parametrize("file_name,format,interactive", [(None, "jpeg", False)]) +def test_save_plotly_static_no_filepath( + file_name, + format, + interactive, + decision_tree_classification_pipeline_class, + tmpdir, + has_minimal_dependencies, +): if not has_minimal_dependencies: pipeline = decision_tree_classification_pipeline_class feat_fig_ = pipeline.graph_feature_importance() - filepath = os.path.join(str(tmpdir), f'{file_name}') if file_name else None - output_ = save_plot(fig=feat_fig_, filepath=filepath, format=format, interactive=interactive, return_filepath=True) + filepath = os.path.join(str(tmpdir), f"{file_name}") if file_name else None + output_ = save_plot( + fig=feat_fig_, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=True, + ) assert os.path.exists(output_) assert isinstance(output_, str) - assert os.path.basename(output_) == 'test_plot.jpeg' - os.remove('test_plot.jpeg') - - -@pytest.mark.parametrize("file_name,format,interactive", - [ - ('test_plot', 'html', True), - ('test_plot.png', 'html', True), - ('test_plot.', 'html', True), - ('test_plot.png', 'jpeg', True), - ('test_plot', None, True), - ('test_plot.html', None, True) - ]) -def test_save_plotly_interactive(file_name, format, interactive, decision_tree_classification_pipeline_class, tmpdir, has_minimal_dependencies): + assert os.path.basename(output_) == "test_plot.jpeg" + os.remove("test_plot.jpeg") + + +@pytest.mark.parametrize( + "file_name,format,interactive", + [ + ("test_plot", "html", True), + ("test_plot.png", "html", True), + ("test_plot.", "html", True), + ("test_plot.png", "jpeg", True), + ("test_plot", None, True), + ("test_plot.html", None, True), + ], +) +def test_save_plotly_interactive( + file_name, + format, + 
interactive, + decision_tree_classification_pipeline_class, + tmpdir, + has_minimal_dependencies, +): if not has_minimal_dependencies: pipeline = decision_tree_classification_pipeline_class feat_fig_ = pipeline.graph_feature_importance() - filepath = os.path.join(str(tmpdir), f'{file_name}') if file_name else None - no_output_ = save_plot(fig=feat_fig_, filepath=filepath, format=format, interactive=interactive, return_filepath=False) - output_ = save_plot(fig=feat_fig_, filepath=filepath, format=format, interactive=interactive, return_filepath=True) + filepath = os.path.join(str(tmpdir), f"{file_name}") if file_name else None + no_output_ = save_plot( + fig=feat_fig_, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=False, + ) + output_ = save_plot( + fig=feat_fig_, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=True, + ) assert not no_output_ assert os.path.exists(output_) assert isinstance(output_, str) - assert os.path.basename(output_) == 'test_plot.html' + assert os.path.basename(output_) == "test_plot.html" -@pytest.mark.parametrize("file_name,format,interactive", - [ - ('test_plot', 'png', False), - ('test_plot.png', 'png', False), - ('test_plot.', 'png', False) - ]) -def test_save_graphviz_default_format(file_name, format, interactive, fitted_tree_estimators, tmpdir, has_minimal_dependencies): +@pytest.mark.parametrize( + "file_name,format,interactive", + [ + ("test_plot", "png", False), + ("test_plot.png", "png", False), + ("test_plot.", "png", False), + ], +) +def test_save_graphviz_default_format( + file_name, + format, + interactive, + fitted_tree_estimators, + tmpdir, + has_minimal_dependencies, +): if not has_minimal_dependencies: est_class, _ = fitted_tree_estimators src = visualize_decision_tree(estimator=est_class, filled=True, max_depth=3) - filepath = os.path.join(str(tmpdir), f'{file_name}') if file_name else None - no_output_ = save_plot(fig=src, filepath=filepath, format=format, interactive=interactive, return_filepath=False) - output_ = save_plot(fig=src, filepath=filepath, format=format, interactive=interactive, return_filepath=True) + filepath = os.path.join(str(tmpdir), f"{file_name}") if file_name else None + no_output_ = save_plot( + fig=src, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=False, + ) + output_ = save_plot( + fig=src, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=True, + ) assert not no_output_ assert os.path.exists(output_) assert isinstance(output_, str) - assert os.path.basename(output_) == 'test_plot.png' - - -@pytest.mark.parametrize("file_name,format,interactive", - [ - ('test_plot', 'jpeg', False) - ]) -def test_save_graphviz_different_format(file_name, format, interactive, fitted_tree_estimators, tmpdir, has_minimal_dependencies): + assert os.path.basename(output_) == "test_plot.png" + + +@pytest.mark.parametrize("file_name,format,interactive", [("test_plot", "jpeg", False)]) +def test_save_graphviz_different_format( + file_name, + format, + interactive, + fitted_tree_estimators, + tmpdir, + has_minimal_dependencies, +): if not has_minimal_dependencies: est_class, _ = fitted_tree_estimators src = visualize_decision_tree(estimator=est_class, filled=True, max_depth=3) - filepath = os.path.join(str(tmpdir), f'{file_name}') - no_output_ = save_plot(fig=src, filepath=filepath, format=format, interactive=interactive, return_filepath=False) - output_ = save_plot(fig=src, filepath=filepath, format=format, 
interactive=interactive, return_filepath=True) + filepath = os.path.join(str(tmpdir), f"{file_name}") + no_output_ = save_plot( + fig=src, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=False, + ) + output_ = save_plot( + fig=src, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=True, + ) assert not no_output_ assert os.path.exists(output_) assert isinstance(output_, str) - assert os.path.basename(output_) == 'test_plot.png' + assert os.path.basename(output_) == "test_plot.png" -@pytest.mark.parametrize("file_name,format,interactive", - [ - ('Output/in_folder_plot', 'jpeg', True) - ]) -def test_save_graphviz_invalid_filepath(file_name, format, interactive, fitted_tree_estimators, tmpdir, has_minimal_dependencies): +@pytest.mark.parametrize( + "file_name,format,interactive", [("Output/in_folder_plot", "jpeg", True)] +) +def test_save_graphviz_invalid_filepath( + file_name, + format, + interactive, + fitted_tree_estimators, + tmpdir, + has_minimal_dependencies, +): if not has_minimal_dependencies: est_class, _ = fitted_tree_estimators src = visualize_decision_tree(estimator=est_class, filled=True, max_depth=3) - filepath = f'{file_name}.{format}' + filepath = f"{file_name}.{format}" with pytest.raises(ValueError, match="Specified filepath is not writeable"): - save_plot(fig=src, filepath=filepath, format=format, interactive=interactive, return_filepath=False) - - -@pytest.mark.parametrize("file_name,format,interactive", - [ - ('example_plot', None, False), - ('example_plot', 'png', False) - ]) -def test_save_graphviz_different_filename_output(file_name, format, interactive, fitted_tree_estimators, tmpdir, has_minimal_dependencies): + save_plot( + fig=src, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=False, + ) + + +@pytest.mark.parametrize( + "file_name,format,interactive", + [("example_plot", None, False), ("example_plot", "png", False)], +) +def test_save_graphviz_different_filename_output( + file_name, + format, + interactive, + fitted_tree_estimators, + tmpdir, + has_minimal_dependencies, +): if not has_minimal_dependencies: est_class, _ = fitted_tree_estimators src = visualize_decision_tree(estimator=est_class, filled=True, max_depth=3) - filepath = os.path.join(str(tmpdir), f'{file_name}') - no_output_ = save_plot(fig=src, filepath=filepath, format=format, interactive=interactive, return_filepath=False) - output_ = save_plot(fig=src, filepath=filepath, format=format, interactive=interactive, return_filepath=True) + filepath = os.path.join(str(tmpdir), f"{file_name}") + no_output_ = save_plot( + fig=src, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=False, + ) + output_ = save_plot( + fig=src, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=True, + ) assert not no_output_ assert os.path.exists(output_) assert isinstance(output_, str) - assert os.path.basename(output_) == 'example_plot.png' + assert os.path.basename(output_) == "example_plot.png" -@pytest.mark.parametrize("file_name,format,interactive", - [ - ('test_plot', 'png', False), - ('test_plot.png', 'png', False), - ('test_plot.', 'png', False), - ('test_plot.png', 'jpeg', False) - ]) -def test_save_matplotlib_default_format(file_name, format, interactive, fitted_tree_estimators, tmpdir): +@pytest.mark.parametrize( + "file_name,format,interactive", + [ + ("test_plot", "png", False), + ("test_plot.png", "png", False), + ("test_plot.", "png", False), + 
("test_plot.png", "jpeg", False), + ], +) +def test_save_matplotlib_default_format( + file_name, format, interactive, fitted_tree_estimators, tmpdir +): plt = pytest.importorskip("matplotlib.pyplot") def setup_plt(): @@ -442,24 +673,45 @@ def setup_plt(): return fig_ fig = setup_plt() - filepath = os.path.join(str(tmpdir), f'{file_name}') - no_output_ = save_plot(fig=fig, filepath=filepath, format=format, interactive=interactive, return_filepath=False) - output_ = save_plot(fig=fig, filepath=filepath, format=format, interactive=interactive, return_filepath=True) + filepath = os.path.join(str(tmpdir), f"{file_name}") + no_output_ = save_plot( + fig=fig, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=False, + ) + output_ = save_plot( + fig=fig, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=True, + ) assert not no_output_ assert os.path.exists(output_) assert isinstance(output_, str) - assert os.path.basename(output_) == 'test_plot.png' + assert os.path.basename(output_) == "test_plot.png" -@pytest.mark.parametrize("file_name,format,interactive", - [ - ('test_plot', 'png', False), - ('test_plot.png', 'png', False), - ('test_plot.', 'png', False), - ('test_plot.png', 'jpeg', False) - ]) -def test_save_seaborn_default_format(file_name, format, interactive, fitted_tree_estimators, tmpdir, has_minimal_dependencies): +@pytest.mark.parametrize( + "file_name,format,interactive", + [ + ("test_plot", "png", False), + ("test_plot.png", "png", False), + ("test_plot.", "png", False), + ("test_plot.png", "jpeg", False), + ], +) +def test_save_seaborn_default_format( + file_name, + format, + interactive, + fitted_tree_estimators, + tmpdir, + has_minimal_dependencies, +): sns = pytest.importorskip("seaborn") def setup_plt(): @@ -468,14 +720,26 @@ def setup_plt(): return fig fig = setup_plt() - filepath = os.path.join(str(tmpdir), f'{file_name}') - no_output_ = save_plot(fig=fig, filepath=filepath, format=format, interactive=interactive, return_filepath=False) - output_ = save_plot(fig=fig, filepath=filepath, format=format, interactive=interactive, return_filepath=True) + filepath = os.path.join(str(tmpdir), f"{file_name}") + no_output_ = save_plot( + fig=fig, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=False, + ) + output_ = save_plot( + fig=fig, + filepath=filepath, + format=format, + interactive=interactive, + return_filepath=True, + ) assert not no_output_ assert os.path.exists(output_) assert isinstance(output_, str) - assert os.path.basename(output_) == 'test_plot.png' + assert os.path.basename(output_) == "test_plot.png" def test_deprecate_arg(): @@ -488,4 +752,6 @@ def test_deprecate_arg(): warnings.simplefilter("always") assert deprecate_arg("foo", "bar", 4, 7) == 4 assert len(warn) == 1 - assert str(warn[0].message).startswith("Argument 'foo' has been deprecated in favor of 'bar'") + assert str(warn[0].message).startswith( + "Argument 'foo' has been deprecated in favor of 'bar'" + ) diff --git a/evalml/tests/utils_tests/test_logger.py b/evalml/tests/utils_tests/test_logger.py index 815ac59783..042d6350f3 100644 --- a/evalml/tests/utils_tests/test_logger.py +++ b/evalml/tests/utils_tests/test_logger.py @@ -7,14 +7,9 @@ import pytest from evalml import AutoMLSearch -from evalml.utils.logger import ( - get_logger, - log_subtitle, - log_title, - time_elapsed -) +from evalml.utils.logger import get_logger, log_subtitle, log_title, time_elapsed -TEST_LOGGER_NAME = 'my_logger' +TEST_LOGGER_NAME 
= "my_logger" @pytest.fixture() @@ -23,9 +18,9 @@ def logger_env_cleanup(monkeypatch): if TEST_LOGGER_NAME in logging.Logger.manager.loggerDict: del logging.Logger.manager.loggerDict[TEST_LOGGER_NAME] # clean up any patches to the logger env var - assert os.environ.get('EVALML_LOG_FILE') is None + assert os.environ.get("EVALML_LOG_FILE") is None yield - monkeypatch.delenv('EVALML_LOG_FILE', raising=False) + monkeypatch.delenv("EVALML_LOG_FILE", raising=False) def test_get_logger(logger_env_cleanup, capsys, caplog): @@ -79,9 +74,11 @@ def test_logger_critical(caplog, logger_env_cleanup): assert "CRITICAL" in caplog.text -@patch('evalml.utils.logger.RotatingFileHandler') -def test_get_logger_default(mock_RotatingFileHandler, capsys, caplog, logger_env_cleanup): - assert os.environ.get('EVALML_LOG_FILE') is None +@patch("evalml.utils.logger.RotatingFileHandler") +def test_get_logger_default( + mock_RotatingFileHandler, capsys, caplog, logger_env_cleanup +): + assert os.environ.get("EVALML_LOG_FILE") is None logger = get_logger(TEST_LOGGER_NAME) assert len(logger.handlers) == 2 mock_RotatingFileHandler.assert_called_with(filename=Path("evalml_debug.log")) @@ -97,14 +94,16 @@ def test_get_logger_default(mock_RotatingFileHandler, capsys, caplog, logger_env assert "Exception encountered while setting up debug log file" not in stdouterr.err -@patch('evalml.utils.logger.RotatingFileHandler') -def test_get_logger_path_valid(mock_RotatingFileHandler, monkeypatch, capsys, caplog, logger_env_cleanup): - assert os.environ.get('EVALML_LOG_FILE') is None +@patch("evalml.utils.logger.RotatingFileHandler") +def test_get_logger_path_valid( + mock_RotatingFileHandler, monkeypatch, capsys, caplog, logger_env_cleanup +): + assert os.environ.get("EVALML_LOG_FILE") is None with tempfile.TemporaryDirectory() as temp_dir: - log_file_path = str(Path(temp_dir, 'evalml_debug_custom.log')) - monkeypatch.setenv('EVALML_LOG_FILE', log_file_path) - assert os.environ.get('EVALML_LOG_FILE') == log_file_path + log_file_path = str(Path(temp_dir, "evalml_debug_custom.log")) + monkeypatch.setenv("EVALML_LOG_FILE", log_file_path) + assert os.environ.get("EVALML_LOG_FILE") == log_file_path logger = get_logger(TEST_LOGGER_NAME) assert len(logger.handlers) == 2 @@ -114,21 +113,27 @@ def test_get_logger_path_valid(mock_RotatingFileHandler, monkeypatch, capsys, ca stdouterr = capsys.readouterr() assert "Warning: cannot write debug logs" not in caplog.text - assert"Exception encountered while setting up debug log file" not in caplog.text + assert "Exception encountered while setting up debug log file" not in caplog.text assert "Warning: cannot write debug logs" not in stdouterr.out - assert"Exception encountered while setting up debug log file" not in stdouterr.out + assert "Exception encountered while setting up debug log file" not in stdouterr.out assert "Warning: cannot write debug logs" not in stdouterr.err - assert"Exception encountered while setting up debug log file" not in stdouterr.err + assert "Exception encountered while setting up debug log file" not in stdouterr.err -@patch('evalml.utils.logger.RotatingFileHandler') -def test_get_logger_path_invalid(mock_RotatingFileHandler, monkeypatch, capsys, caplog, logger_env_cleanup): - assert os.environ.get('EVALML_LOG_FILE') is None +@patch("evalml.utils.logger.RotatingFileHandler") +def test_get_logger_path_invalid( + mock_RotatingFileHandler, monkeypatch, capsys, caplog, logger_env_cleanup +): + assert os.environ.get("EVALML_LOG_FILE") is None with tempfile.TemporaryDirectory() 
as temp_dir: - log_file_path = str(Path(temp_dir, 'INVALID', 'PATH', 'DOES_NOT_EXIST', 'evalml_debug_custom.log')) - monkeypatch.setenv('EVALML_LOG_FILE', log_file_path) - assert os.environ.get('EVALML_LOG_FILE') == log_file_path + log_file_path = str( + Path( + temp_dir, "INVALID", "PATH", "DOES_NOT_EXIST", "evalml_debug_custom.log" + ) + ) + monkeypatch.setenv("EVALML_LOG_FILE", log_file_path) + assert os.environ.get("EVALML_LOG_FILE") == log_file_path logger = get_logger(TEST_LOGGER_NAME) assert len(logger.handlers) == 1 @@ -144,13 +149,15 @@ def test_get_logger_path_invalid(mock_RotatingFileHandler, monkeypatch, capsys, assert "Exception encountered while setting up debug log file" not in stdouterr.err -@patch('evalml.utils.logger.RotatingFileHandler') -def test_get_logger_path_valid_but_dir(mock_RotatingFileHandler, monkeypatch, capsys, caplog, logger_env_cleanup): - assert os.environ.get('EVALML_LOG_FILE') is None +@patch("evalml.utils.logger.RotatingFileHandler") +def test_get_logger_path_valid_but_dir( + mock_RotatingFileHandler, monkeypatch, capsys, caplog, logger_env_cleanup +): + assert os.environ.get("EVALML_LOG_FILE") is None with tempfile.TemporaryDirectory() as temp_dir: - monkeypatch.setenv('EVALML_LOG_FILE', temp_dir) - assert os.environ.get('EVALML_LOG_FILE') == temp_dir + monkeypatch.setenv("EVALML_LOG_FILE", temp_dir) + assert os.environ.get("EVALML_LOG_FILE") == temp_dir logger = get_logger(TEST_LOGGER_NAME) assert len(logger.handlers) == 1 @@ -166,12 +173,14 @@ def test_get_logger_path_valid_but_dir(mock_RotatingFileHandler, monkeypatch, ca assert "Exception encountered while setting up debug log file" not in stdouterr.err -@patch('evalml.utils.logger.RotatingFileHandler') -def test_get_logger_path_empty(mock_RotatingFileHandler, monkeypatch, capsys, caplog, logger_env_cleanup): - assert os.environ.get('EVALML_LOG_FILE') is None +@patch("evalml.utils.logger.RotatingFileHandler") +def test_get_logger_path_empty( + mock_RotatingFileHandler, monkeypatch, capsys, caplog, logger_env_cleanup +): + assert os.environ.get("EVALML_LOG_FILE") is None - monkeypatch.setenv('EVALML_LOG_FILE', '') - assert os.environ.get('EVALML_LOG_FILE') == '' + monkeypatch.setenv("EVALML_LOG_FILE", "") + assert os.environ.get("EVALML_LOG_FILE") == "" logger = get_logger(TEST_LOGGER_NAME) assert len(logger.handlers) == 1 @@ -187,10 +196,12 @@ def test_get_logger_path_empty(mock_RotatingFileHandler, monkeypatch, capsys, ca assert "Exception encountered while setting up debug log file" not in stdouterr.err -@patch('evalml.utils.logger.RotatingFileHandler') -def test_get_logger_exception(mock_RotatingFileHandler, capsys, caplog, logger_env_cleanup): - mock_RotatingFileHandler.side_effect = Exception('all your log are belong to us') - assert os.environ.get('EVALML_LOG_FILE') is None +@patch("evalml.utils.logger.RotatingFileHandler") +def test_get_logger_exception( + mock_RotatingFileHandler, capsys, caplog, logger_env_cleanup +): + mock_RotatingFileHandler.side_effect = Exception("all your log are belong to us") + assert os.environ.get("EVALML_LOG_FILE") is None logger = get_logger(TEST_LOGGER_NAME) assert len(logger.handlers) == 1 assert len(mock_RotatingFileHandler.mock_calls) == 1 @@ -205,7 +216,10 @@ def test_get_logger_exception(mock_RotatingFileHandler, capsys, caplog, logger_e assert "Exception encountered while setting up debug log file" not in stdouterr.err -@pytest.mark.parametrize("time_passed,answer", [(101199, "28:06:39"), (3660, "1:01:00"), (65, "01:05"), (7, "00:07")]) 
+@pytest.mark.parametrize( + "time_passed,answer", + [(101199, "28:06:39"), (3660, "1:01:00"), (65, "01:05"), (7, "00:07")], +) @patch("time.time") def test_time_elapsed(mock_time, time_passed, answer): mock_time.return_value = time_passed @@ -213,11 +227,24 @@ def test_time_elapsed(mock_time, time_passed, answer): assert time == answer -@pytest.mark.parametrize("type_, allowed_families, number_, number_min_dep", [("binary", None, 8, 5), ("multiclass", 2, 2, 2), ("regression", 3, 3, 3)]) -def test_pipeline_count(type_, allowed_families, number_, number_min_dep, X_y_binary, X_y_multi, X_y_regression, caplog, has_minimal_dependencies): - if type_ == 'binary': +@pytest.mark.parametrize( + "type_, allowed_families, number_, number_min_dep", + [("binary", None, 8, 5), ("multiclass", 2, 2, 2), ("regression", 3, 3, 3)], +) +def test_pipeline_count( + type_, + allowed_families, + number_, + number_min_dep, + X_y_binary, + X_y_multi, + X_y_regression, + caplog, + has_minimal_dependencies, +): + if type_ == "binary": X, y = X_y_binary - elif type_ == 'multiclass': + elif type_ == "multiclass": X, y = X_y_multi else: X, y = X_y_regression @@ -225,9 +252,23 @@ def test_pipeline_count(type_, allowed_families, number_, number_min_dep, X_y_bi _ = AutoMLSearch(X_train=X, y_train=y, problem_type=type_) else: if allowed_families == 2: - _ = AutoMLSearch(X_train=X, y_train=y, problem_type=type_, allowed_model_families=['random_forest', 'decision_tree']) + _ = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=type_, + allowed_model_families=["random_forest", "decision_tree"], + ) elif allowed_families == 3: - _ = AutoMLSearch(X_train=X, y_train=y, problem_type=type_, allowed_model_families=['random_forest', 'decision_tree', 'extra_trees']) + _ = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=type_, + allowed_model_families=[ + "random_forest", + "decision_tree", + "extra_trees", + ], + ) if has_minimal_dependencies: assert f"{number_min_dep} pipelines ready for search" in caplog.text else: diff --git a/evalml/tests/utils_tests/test_woodwork_utils.py b/evalml/tests/utils_tests/test_woodwork_utils.py index 50bd09fd78..769634c137 100644 --- a/evalml/tests/utils_tests/test_woodwork_utils.py +++ b/evalml/tests/utils_tests/test_woodwork_utils.py @@ -14,8 +14,9 @@ def test_infer_feature_types_no_type_change(): X_dc = ww.init_series(pd.Series([1, 2, 3, 4])) pd.testing.assert_series_equal(X_dc, infer_feature_types(X_dc)) - X_pd = pd.DataFrame({0: pd.Series([1, 2], dtype="int64"), - 1: pd.Series([3, 4], dtype="int64")}) + X_pd = pd.DataFrame( + {0: pd.Series([1, 2], dtype="int64"), 1: pd.Series([3, 4], dtype="int64")} + ) pd.testing.assert_frame_equal(X_pd, infer_feature_types(X_pd)) X_list = [1, 2, 3, 4] @@ -44,18 +45,22 @@ def test_infer_feature_types_series_name(): def test_infer_feature_types_dataframe(): - X_pd = pd.DataFrame({0: pd.Series([1, 2]), - 1: pd.Series([3, 4])}) + X_pd = pd.DataFrame({0: pd.Series([1, 2]), 1: pd.Series([3, 4])}) pd.testing.assert_frame_equal(X_pd, infer_feature_types(X_pd), check_dtype=False) - X_pd = pd.DataFrame({0: pd.Series([1, 2], dtype="int64"), - 1: pd.Series([3, 4], dtype="int64")}) + X_pd = pd.DataFrame( + {0: pd.Series([1, 2], dtype="int64"), 1: pd.Series([3, 4], dtype="int64")} + ) pd.testing.assert_frame_equal(X_pd, infer_feature_types(X_pd)) X_expected = X_pd.copy() X_expected[0] = X_expected[0].astype("category") - pd.testing.assert_frame_equal(X_expected, infer_feature_types(X_pd, {0: "categorical"})) - pd.testing.assert_frame_equal(X_expected, 
infer_feature_types(X_pd, {0: ww.logical_types.Categorical})) + pd.testing.assert_frame_equal( + X_expected, infer_feature_types(X_pd, {0: "categorical"}) + ) + pd.testing.assert_frame_equal( + X_expected, infer_feature_types(X_pd, {0: ww.logical_types.Categorical}) + ) def test_infer_feature_types_series(): @@ -72,15 +77,22 @@ def test_infer_feature_types_series(): X_pd = pd.Series([1, 2, 3, 4], dtype="int64") X_expected = X_pd.astype("category") - pd.testing.assert_series_equal(X_expected, infer_feature_types(X_pd, ww.logical_types.Categorical)) - - -@pytest.mark.parametrize("value,error", - [ - (1, False), (-1, False), - (2.3, False), (None, True), - (np.nan, True), ("hello", True) - ]) + pd.testing.assert_series_equal( + X_expected, infer_feature_types(X_pd, ww.logical_types.Categorical) + ) + + +@pytest.mark.parametrize( + "value,error", + [ + (1, False), + (-1, False), + (2.3, False), + (None, True), + (np.nan, True), + ("hello", True), + ], +) @pytest.mark.parametrize("datatype", ["np", "pd", "ww"]) def test_convert_numeric_dataset_pandas(datatype, value, error, make_data_type): if datatype == "np" and value == "hello": @@ -92,7 +104,9 @@ def test_convert_numeric_dataset_pandas(datatype, value, error, make_data_type): y = make_data_type(datatype, y) if error: - with pytest.raises(ValueError, match="Values not all numeric or there are null"): + with pytest.raises( + ValueError, match="Values not all numeric or there are null" + ): _convert_numeric_dataset_pandas(X, y) else: X_transformed, y_transformed = _convert_numeric_dataset_pandas(X, y) @@ -105,39 +119,58 @@ def test_convert_numeric_dataset_pandas(datatype, value, error, make_data_type): def test_infer_feature_types_value_error(): - df = pd.DataFrame({"a": pd.Series([1, 2, 3]), - "b": pd.Series([4, 5, 6]), - "c": pd.Series([True, False, True])}) + df = pd.DataFrame( + { + "a": pd.Series([1, 2, 3]), + "b": pd.Series([4, 5, 6]), + "c": pd.Series([True, False, True]), + } + ) df.ww.init(logical_types={"a": "IntegerNullable", "c": "BooleanNullable"}) msg = "These are the columns with nullable types: \\[\\('a', 'Int64'\\), \\('c', 'boolean'\\)\\]" with pytest.raises(ValueError, match=msg): infer_feature_types(df) - y = pd.Series([1, 2, 3], name='series') + y = pd.Series([1, 2, 3], name="series") y = ww.init_series(y, logical_type="IntegerNullable") - with pytest.raises(ValueError, match="These are the columns with nullable types: \\[\\('series', 'Int64'\\)]"): + with pytest.raises( + ValueError, + match="These are the columns with nullable types: \\[\\('series', 'Int64'\\)]", + ): infer_feature_types(y) - df = pd.DataFrame({"A": pd.Series([4, 5, 6], dtype='Float64'), "b": [1, 2, 3]}) - with pytest.raises(ValueError, match="These are the columns with nullable types: \\[\\('A', 'Float64'\\)]"): + df = pd.DataFrame({"A": pd.Series([4, 5, 6], dtype="Float64"), "b": [1, 2, 3]}) + with pytest.raises( + ValueError, + match="These are the columns with nullable types: \\[\\('A', 'Float64'\\)]", + ): infer_feature_types(df) def test_infer_feature_types_preserves_semantic_tags(): - df = pd.DataFrame({"a": pd.Series([1, 2, 3]), - "b": pd.Series([4, 5, 6]), - "c": pd.Series([True, False, True]), - "my_index": [1, 2, 3], - "time_index": ["2020-01-01", "2020-01-02", "2020-01-03"]}) - df.ww.init(logical_types={"a": "Integer", "c": "Categorical", "b": "Double"}, - semantic_tags={"a": "My Integer", "c": "My Categorical", "b": "My Double"}, - index='my_index', time_index='time_index') + df = pd.DataFrame( + { + "a": pd.Series([1, 2, 3]), + "b": 
pd.Series([4, 5, 6]), + "c": pd.Series([True, False, True]), + "my_index": [1, 2, 3], + "time_index": ["2020-01-01", "2020-01-02", "2020-01-03"], + } + ) + df.ww.init( + logical_types={"a": "Integer", "c": "Categorical", "b": "Double"}, + semantic_tags={"a": "My Integer", "c": "My Categorical", "b": "My Double"}, + index="my_index", + time_index="time_index", + ) new_df = infer_feature_types(df) assert new_df.ww.schema == df.ww.schema - series = pd.Series([1, 2, 3], name='target') - series.ww.init(logical_type="Integer", semantic_tags=["Cool Series"], description="Great data") + series = pd.Series([1, 2, 3], name="target") + series.ww.init( + logical_type="Integer", semantic_tags=["Cool Series"], description="Great data" + ) assert series.ww.schema == infer_feature_types(series).ww.schema diff --git a/evalml/tuners/grid_search_tuner.py b/evalml/tuners/grid_search_tuner.py index 4b078fe1a6..4a7355eb77 100644 --- a/evalml/tuners/grid_search_tuner.py +++ b/evalml/tuners/grid_search_tuner.py @@ -17,7 +17,7 @@ class GridSearchTuner(Tuner): """ def __init__(self, pipeline_hyperparameter_ranges, n_points=10, random_seed=0): - """ Generate all of the possible points to search for in the grid + """Generate all of the possible points to search for in the grid Arguments: pipeline_hyperparameter_ranges (dict): a set of hyperparameter ranges corresponding to a pipeline's parameters @@ -32,7 +32,11 @@ def __init__(self, pipeline_hyperparameter_ranges, n_points=10, random_seed=0): if isinstance(dimension, list): range_values = dimension elif isinstance(dimension, (Real, Integer, tuple)): - if isinstance(dimension, (tuple)) and isinstance(dimension[0], (int, float)) and isinstance(dimension[1], (int, float)): + if ( + isinstance(dimension, (tuple)) + and isinstance(dimension[0], (int, float)) + and isinstance(dimension[1], (int, float)) + ): if dimension[1] > dimension[0]: low = dimension[0] high = dimension[1] @@ -90,5 +94,7 @@ def is_search_space_exhausted(self): self.curr_params = next(self._grid_points) return False except StopIteration: - raise NoParamsException("Grid search has exhausted all possible parameters.") + raise NoParamsException( + "Grid search has exhausted all possible parameters." + ) return True diff --git a/evalml/tuners/random_search_tuner.py b/evalml/tuners/random_search_tuner.py index 0c8b4bf23c..7c974a8098 100644 --- a/evalml/tuners/random_search_tuner.py +++ b/evalml/tuners/random_search_tuner.py @@ -14,8 +14,14 @@ class RandomSearchTuner(Tuner): >>> assert proposal['My Component'] == {'param a': 3.7454011884736254, 'param b': 'c'} """ - def __init__(self, pipeline_hyperparameter_ranges, random_seed=0, with_replacement=False, replacement_max_attempts=10): - """ Sets up check for duplication if needed. + def __init__( + self, + pipeline_hyperparameter_ranges, + random_seed=0, + with_replacement=False, + replacement_max_attempts=10, + ): + """Sets up check for duplication if needed. Arguments: pipeline_hyperparameter_ranges (dict): a set of hyperparameter ranges corresponding to a pipeline's parameters @@ -84,7 +90,9 @@ def is_search_space_exhausted(self): attempts = 0 while curr_params in self._used_parameters: if attempts >= self._replacement_max_attempts: - raise NoParamsException("Cannot create a unique set of unexplored parameters. Try expanding the search space.") + raise NoParamsException( + "Cannot create a unique set of unexplored parameters. Try expanding the search space." 
+ ) return True attempts += 1 curr_params = self._get_sample() diff --git a/evalml/tuners/skopt_tuner.py b/evalml/tuners/skopt_tuner.py index 3b5bd46e1d..d6660758b8 100644 --- a/evalml/tuners/skopt_tuner.py +++ b/evalml/tuners/skopt_tuner.py @@ -22,7 +22,12 @@ def __init__(self, pipeline_hyperparameter_ranges, random_seed=0): random_seed (int): The seed for the random number generator. Defaults to 0. """ super().__init__(pipeline_hyperparameter_ranges, random_seed=random_seed) - self.opt = Optimizer(self._search_space_ranges, "ET", acq_optimizer="sampling", random_state=random_seed) + self.opt = Optimizer( + self._search_space_ranges, + "ET", + acq_optimizer="sampling", + random_state=random_seed, + ) def add(self, pipeline_parameters, score): """Add score to sample @@ -41,14 +46,18 @@ def add(self, pipeline_parameters, score): try: self.opt.tell(flat_parameter_values, score) except Exception as e: - logger.debug('SKOpt tuner received error during add. Score: {}\nParameters: {}\nFlat parameter values: {}\nError: {}' - .format(pipeline_parameters, score, flat_parameter_values, e)) + logger.debug( + "SKOpt tuner received error during add. Score: {}\nParameters: {}\nFlat parameter values: {}\nError: {}".format( + pipeline_parameters, score, flat_parameter_values, e + ) + ) if str(e) == "'<=' not supported between instances of 'int' and 'NoneType'": - msg = "Invalid parameters specified to SKOptTuner.add: parameters {} error {}" \ - .format(pipeline_parameters, str(e)) + msg = "Invalid parameters specified to SKOptTuner.add: parameters {} error {}".format( + pipeline_parameters, str(e) + ) logger.error(msg) raise ParameterError(msg) - raise(e) + raise (e) def propose(self): """Returns a suggested set of parameters to train and score a pipeline with, based off the search space dimensions and prior samples. 
@@ -57,7 +66,7 @@ def propose(self): dict: Proposed pipeline parameters """ with warnings.catch_warnings(): - warnings.simplefilter('ignore') + warnings.simplefilter("ignore") if not len(self._search_space_ranges): return self._convert_to_pipeline_parameters({}) flat_parameters = self.opt.ask() diff --git a/evalml/tuners/tuner.py b/evalml/tuners/tuner.py index 21b61e932f..bf84602eac 100644 --- a/evalml/tuners/tuner.py +++ b/evalml/tuners/tuner.py @@ -21,19 +21,36 @@ def __init__(self, pipeline_hyperparameter_ranges, random_seed=0): self._search_space_names = [] self._search_space_ranges = [] if not isinstance(pipeline_hyperparameter_ranges, dict): - raise ValueError('pipeline_hyperparameter_ranges must be a dict but is of type {}'.format(type(pipeline_hyperparameter_ranges))) + raise ValueError( + "pipeline_hyperparameter_ranges must be a dict but is of type {}".format( + type(pipeline_hyperparameter_ranges) + ) + ) self._component_names = list(pipeline_hyperparameter_ranges.keys()) for component_name, component_ranges in pipeline_hyperparameter_ranges.items(): if not isinstance(component_ranges, dict): - raise ValueError('pipeline_hyperparameter_ranges has invalid entry for {}: {}'.format(component_name, component_ranges)) + raise ValueError( + "pipeline_hyperparameter_ranges has invalid entry for {}: {}".format( + component_name, component_ranges + ) + ) for parameter_name, parameter_range in component_ranges.items(): if parameter_range is None: - raise ValueError('pipeline_hyperparameter_ranges has invalid dimensions for ' + - '{} parameter {}: None.'.format(component_name, parameter_name)) - if not isinstance(parameter_range, (Real, Integer, Categorical, list, tuple)): + raise ValueError( + "pipeline_hyperparameter_ranges has invalid dimensions for " + + "{} parameter {}: None.".format( + component_name, parameter_name + ) + ) + if not isinstance( + parameter_range, (Real, Integer, Categorical, list, tuple) + ): continue - flat_parameter_name = '{}: {}'.format(component_name, parameter_name) - self._parameter_names_map[flat_parameter_name] = (component_name, parameter_name) + flat_parameter_name = "{}: {}".format(component_name, parameter_name) + self._parameter_names_map[flat_parameter_name] = ( + component_name, + parameter_name, + ) self._search_space_names.append(flat_parameter_name) self._search_space_ranges.append(parameter_range) @@ -41,17 +58,34 @@ def _convert_to_flat_parameters(self, pipeline_parameters): """Convert from pipeline parameters to a flat list of values""" flat_parameter_values = [] for flat_parameter_name in self._search_space_names: - component_name, parameter_name = self._parameter_names_map[flat_parameter_name] - if component_name not in pipeline_parameters or parameter_name not in pipeline_parameters[component_name]: - raise TypeError('Pipeline parameters missing required field "{}" for component "{}"'.format(parameter_name, component_name)) - flat_parameter_values.append(pipeline_parameters[component_name][parameter_name]) + component_name, parameter_name = self._parameter_names_map[ + flat_parameter_name + ] + if ( + component_name not in pipeline_parameters + or parameter_name not in pipeline_parameters[component_name] + ): + raise TypeError( + 'Pipeline parameters missing required field "{}" for component "{}"'.format( + parameter_name, component_name + ) + ) + flat_parameter_values.append( + pipeline_parameters[component_name][parameter_name] + ) return flat_parameter_values def _convert_to_pipeline_parameters(self, flat_parameters): """Convert from a 
flat list of values to a dict of pipeline parameters""" - pipeline_parameters = {component_name: dict() for component_name in self._component_names} - for flat_parameter_name, parameter_value in zip(self._search_space_names, flat_parameters): - component_name, parameter_name = self._parameter_names_map[flat_parameter_name] + pipeline_parameters = { + component_name: dict() for component_name in self._component_names + } + for flat_parameter_name, parameter_value in zip( + self._search_space_names, flat_parameters + ): + component_name, parameter_name = self._parameter_names_map[ + flat_parameter_name + ] pipeline_parameters[component_name][parameter_name] = parameter_value return pipeline_parameters diff --git a/evalml/tuners/tuner_exceptions.py b/evalml/tuners/tuner_exceptions.py index a1fecf7101..ac7027c49c 100644 --- a/evalml/tuners/tuner_exceptions.py +++ b/evalml/tuners/tuner_exceptions.py @@ -1,8 +1,10 @@ class NoParamsException(Exception): """Raised when a tuner exhausts its search space and runs out of parameters to propose.""" + pass class ParameterError(Exception): """Raised when a tuner encounters an error with the parameters being used with it.""" + pass diff --git a/evalml/utils/__init__.py b/evalml/utils/__init__.py index 15822e8523..21e3508014 100644 --- a/evalml/utils/__init__.py +++ b/evalml/utils/__init__.py @@ -15,7 +15,7 @@ is_all_numeric, get_importable_subclasses, _rename_column_names_to_numeric, - deprecate_arg + deprecate_arg, ) from .cli_utils import ( get_evalml_root, @@ -23,7 +23,7 @@ get_sys_info, print_deps, print_info, - print_sys_info + print_sys_info, ) from .woodwork_utils import ( _retain_custom_types_and_initalize_woodwork, diff --git a/evalml/utils/base_meta.py b/evalml/utils/base_meta.py index 116e29ee79..3857350c20 100644 --- a/evalml/utils/base_meta.py +++ b/evalml/utils/base_meta.py @@ -1,5 +1,3 @@ - - from abc import ABCMeta from functools import wraps @@ -7,9 +5,9 @@ class BaseMeta(ABCMeta): """Metaclass that overrides creating a new component or pipeline by wrapping methods with validators and setters""" - FIT_METHODS = ['fit', 'fit_transform'] - METHODS_TO_CHECK = ['predict', 'predict_proba', 'transform', 'inverse_transform'] - PROPERTIES_TO_CHECK = ['feature_importance'] + FIT_METHODS = ["fit", "fit_transform"] + METHODS_TO_CHECK = ["predict", "predict_proba", "transform", "inverse_transform"] + PROPERTIES_TO_CHECK = ["feature_importance"] @classmethod def set_fit(cls, method): @@ -18,6 +16,7 @@ def _set_fit(self, X, y=None): return_value = method(self, X, y) self._is_fitted = True return return_value + return _set_fit def __new__(cls, name, bases, dct): @@ -28,8 +27,10 @@ def __new__(cls, name, bases, dct): dct[attribute] = cls.check_for_fit(dct[attribute]) if attribute in cls.PROPERTIES_TO_CHECK: property_orig = dct[attribute] - dct[attribute] = property(cls.check_for_fit(property_orig.__get__), - property_orig.__set__, - property_orig.__delattr__, - property_orig.__doc__) + dct[attribute] = property( + cls.check_for_fit(property_orig.__get__), + property_orig.__set__, + property_orig.__delattr__, + property_orig.__doc__, + ) return super().__new__(cls, name, bases, dct) diff --git a/evalml/utils/cli_utils.py b/evalml/utils/cli_utils.py index 93fc725abc..231cad2ae1 100644 --- a/evalml/utils/cli_utils.py +++ b/evalml/utils/cli_utils.py @@ -63,22 +63,28 @@ def get_sys_info(): """ blob = [] try: - (sysname, nodename, release, - version, machine, processor) = platform.uname() - blob.extend([ - ("python", '.'.join(map(str, sys.version_info))), - 
("python-bits", struct.calcsize("P") * 8), - ("OS", "{sysname}".format(sysname=sysname)), - ("OS-release", "{release}".format(release=release)), - ("machine", "{machine}".format(machine=machine)), - ("processor", "{processor}".format(processor=processor)), - ("byteorder", "{byteorder}".format(byteorder=sys.byteorder)), - ("LC_ALL", "{lc}".format(lc=os.environ.get('LC_ALL', "None"))), - ("LANG", "{lang}".format(lang=os.environ.get('LANG', "None"))), - ("LOCALE", '.'.join(map(str, locale.getlocale()))), - ("# of CPUS", "{cpus}".format(cpus=psutil.cpu_count())), - ("Available memory", "{memory}".format(memory=bytes2human(psutil.virtual_memory().available))) - ]) + (sysname, nodename, release, version, machine, processor) = platform.uname() + blob.extend( + [ + ("python", ".".join(map(str, sys.version_info))), + ("python-bits", struct.calcsize("P") * 8), + ("OS", "{sysname}".format(sysname=sysname)), + ("OS-release", "{release}".format(release=release)), + ("machine", "{machine}".format(machine=machine)), + ("processor", "{processor}".format(processor=processor)), + ("byteorder", "{byteorder}".format(byteorder=sys.byteorder)), + ("LC_ALL", "{lc}".format(lc=os.environ.get("LC_ALL", "None"))), + ("LANG", "{lang}".format(lang=os.environ.get("LANG", "None"))), + ("LOCALE", ".".join(map(str, locale.getlocale()))), + ("# of CPUS", "{cpus}".format(cpus=psutil.cpu_count())), + ( + "Available memory", + "{memory}".format( + memory=bytes2human(psutil.virtual_memory().available) + ), + ), + ] + ) except (KeyError, ValueError): pass diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py index 81137033f8..9767c3ecd8 100644 --- a/evalml/utils/gen_utils.py +++ b/evalml/utils/gen_utils.py @@ -8,10 +8,7 @@ import pandas as pd from sklearn.utils import check_random_state -from evalml.exceptions import ( - EnsembleMissingPipelinesError, - MissingComponentError -) +from evalml.exceptions import EnsembleMissingPipelinesError, MissingComponentError from evalml.utils import get_logger logger = get_logger(__file__) @@ -31,13 +28,13 @@ def import_or_raise(library, error_msg=None, warning=False): except ImportError: if error_msg is None: error_msg = "" - msg = (f"Missing optional dependency '{library}'. Please use pip to install {library}. {error_msg}") + msg = f"Missing optional dependency '{library}'. Please use pip to install {library}. {error_msg}" if warning: warnings.warn(msg) else: raise ImportError(msg) except Exception as ex: - msg = (f"An exception occurred while trying to import `{library}`: {str(ex)}") + msg = f"An exception occurred while trying to import `{library}`: {str(ex)}" if warning: warnings.warn(msg) else: @@ -46,11 +43,11 @@ def import_or_raise(library, error_msg=None, warning=False): def convert_to_seconds(input_str): """Converts a string describing a length of time to its length in seconds.""" - hours = {'h', 'hr', 'hour', 'hours'} - minutes = {'m', 'min', 'minute', 'minutes'} - seconds = {'s', 'sec', 'second', 'seconds'} + hours = {"h", "hr", "hour", "hours"} + minutes = {"m", "min", "minute", "minutes"} + seconds = {"s", "sec", "second", "seconds"} value, unit = input_str.split() - if unit[-1] == 's' and len(unit) != 1: + if unit[-1] == "s" and len(unit) != 1: unit = unit[:-1] if unit in seconds: return float(value) @@ -59,14 +56,18 @@ def convert_to_seconds(input_str): elif unit in hours: return float(value) * 3600 else: - msg = "Invalid unit. Units must be hours, mins, or seconds. Received '{}'".format(unit) + msg = ( + "Invalid unit. Units must be hours, mins, or seconds. 
Received '{}'".format( + unit + ) + ) raise AssertionError(msg) # specifies the min and max values a seed to np.random.RandomState is allowed to take. # these limits were chosen to fit in the numpy.int32 datatype to avoid issues with 32-bit systems # see https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.random.RandomState.html -SEED_BOUNDS = namedtuple('SEED_BOUNDS', ('min_bound', 'max_bound'))(0, 2**31 - 1) +SEED_BOUNDS = namedtuple("SEED_BOUNDS", ("min_bound", "max_bound"))(0, 2 ** 31 - 1) def get_random_state(seed): @@ -75,12 +76,20 @@ def get_random_state(seed): Arguments: seed (None, int, np.random.RandomState object): seed to use to generate numpy.random.RandomState. Must be between SEED_BOUNDS.min_bound and SEED_BOUNDS.max_bound, inclusive. Otherwise, an exception will be thrown. """ - if isinstance(seed, (int, np.integer)) and (seed < SEED_BOUNDS.min_bound or SEED_BOUNDS.max_bound < seed): - raise ValueError('Seed "{}" is not in the range [{}, {}], inclusive'.format(seed, SEED_BOUNDS.min_bound, SEED_BOUNDS.max_bound)) + if isinstance(seed, (int, np.integer)) and ( + seed < SEED_BOUNDS.min_bound or SEED_BOUNDS.max_bound < seed + ): + raise ValueError( + 'Seed "{}" is not in the range [{}, {}], inclusive'.format( + seed, SEED_BOUNDS.min_bound, SEED_BOUNDS.max_bound + ) + ) return check_random_state(seed) -def get_random_seed(random_state, min_bound=SEED_BOUNDS.min_bound, max_bound=SEED_BOUNDS.max_bound): +def get_random_seed( + random_state, min_bound=SEED_BOUNDS.min_bound, max_bound=SEED_BOUNDS.max_bound +): """Given a numpy.random.RandomState object, generate an int representing a seed value for another random number generator. Or, if given an int, return that int. To protect against invalid input to a particular library's random number generator, if an int value is provided, and it is outside the bounds "[min_bound, max_bound)", the value will be projected into the range between the min_bound (inclusive) and max_bound (exclusive) using modular arithmetic. @@ -94,7 +103,11 @@ def get_random_seed(random_state, min_bound=SEED_BOUNDS.min_bound, max_bound=SEE int: seed for random number generator """ if not min_bound < max_bound: - raise ValueError("Provided min_bound {} is not less than max_bound {}".format(min_bound, max_bound)) + raise ValueError( + "Provided min_bound {} is not less than max_bound {}".format( + min_bound, max_bound + ) + ) if isinstance(random_state, np.random.RandomState): return random_state.randint(min_bound, max_bound) if random_state < min_bound or random_state >= max_bound: @@ -104,20 +117,20 @@ def get_random_seed(random_state, min_bound=SEED_BOUNDS.min_bound, max_bound=SEE class classproperty: """Allows function to be accessed as a class level property. 
- Example: - class LogisticRegressionBinaryPipeline(PipelineBase): - component_graph = ['Simple Imputer', 'Logistic Regression Classifier'] - - @classproperty - def summary(cls): - summary = "" - for component in cls.component_graph: - component = handle_component_class(component) - summary += component.name + " + " - return summary - - assert LogisticRegressionBinaryPipeline.summary == "Simple Imputer + Logistic Regression Classifier + " - assert LogisticRegressionBinaryPipeline().summary == "Simple Imputer + Logistic Regression Classifier + " + Example: + class LogisticRegressionBinaryPipeline(PipelineBase): + component_graph = ['Simple Imputer', 'Logistic Regression Classifier'] + + @classproperty + def summary(cls): + summary = "" + for component in cls.component_graph: + component = handle_component_class(component) + summary += component.name + " + " + return summary + + assert LogisticRegressionBinaryPipeline.summary == "Simple Imputer + Logistic Regression Classifier + " + assert LogisticRegressionBinaryPipeline().summary == "Simple Imputer + Logistic Regression Classifier + " """ def __init__(self, func): @@ -152,9 +165,16 @@ def _get_subclasses(base_class): return subclasses -_not_used_in_automl = {'BaselineClassifier', 'BaselineRegressor', 'TimeSeriesBaselineEstimator', - 'StackedEnsembleClassifier', 'StackedEnsembleRegressor', 'KNeighborsClassifier', - 'SVMClassifier', 'SVMRegressor'} +_not_used_in_automl = { + "BaselineClassifier", + "BaselineRegressor", + "TimeSeriesBaselineEstimator", + "StackedEnsembleClassifier", + "StackedEnsembleRegressor", + "KNeighborsClassifier", + "SVMClassifier", + "SVMRegressor", +} def get_importable_subclasses(base_class, used_in_automl=True): @@ -176,13 +196,15 @@ def get_importable_subclasses(base_class, used_in_automl=True): classes = [] for cls in all_classes: - if 'evalml.pipelines' not in cls.__module__: + if "evalml.pipelines" not in cls.__module__: continue try: cls() classes.append(cls) except (ImportError, MissingComponentError, TypeError): - logger.debug(f'Could not import class {cls.__name__} in get_importable_subclasses') + logger.debug( + f"Could not import class {cls.__name__} in get_importable_subclasses" + ) except EnsembleMissingPipelinesError: classes.append(cls) if used_in_automl: @@ -210,9 +232,13 @@ def _rename_column_names_to_numeric(X, flatten_tuples=True): if flatten_tuples and (len(X.columns) > 0 and isinstance(X.columns, pd.MultiIndex)): flat_col_names = list(map(str, X_renamed.columns)) X_renamed.columns = flat_col_names - rename_cols_dict = dict((str(col), col_num) for col_num, col in enumerate(list(X.columns))) + rename_cols_dict = dict( + (str(col), col_num) for col_num, col in enumerate(list(X.columns)) + ) else: - rename_cols_dict = dict((col, col_num) for col_num, col in enumerate(list(X.columns))) + rename_cols_dict = dict( + (col, col_num) for col_num, col in enumerate(list(X.columns)) + ) X_renamed.rename(columns=rename_cols_dict, inplace=True) return X_renamed @@ -244,7 +270,7 @@ def safe_repr(value): """ if isinstance(value, float): if pd.isna(value): - return 'np.nan' + return "np.nan" if np.isinf(value): return f"float('{repr(value)}')" return repr(value) @@ -280,13 +306,17 @@ def pad_with_nans(pd_data, num_to_pad): if isinstance(pd_data, pd.Series): padding = pd.Series([np.nan] * num_to_pad, name=pd_data.name) else: - padding = pd.DataFrame({col: [np.nan] * num_to_pad - for col in pd_data.columns}) + padding = pd.DataFrame({col: [np.nan] * num_to_pad for col in pd_data.columns}) padded = 
pd.concat([padding, pd_data], ignore_index=True) # By default, pd.concat will convert all types to object if there are mixed numerics and objects # The call to convert_dtypes ensures numerics stay numerics in the new dataframe. - return padded.convert_dtypes(infer_objects=True, convert_string=False, convert_floating=False, - convert_integer=False, convert_boolean=False) + return padded.convert_dtypes( + infer_objects=True, + convert_string=False, + convert_floating=False, + convert_integer=False, + convert_boolean=False, + ) def _get_rows_without_nans(*data): @@ -299,6 +329,7 @@ def _get_rows_without_nans(*data): np.ndarray: mask where each entry is True if and only if all corresponding entries in that index in data are non-nan. """ + def _not_nan(pd_data): if pd_data is None or len(pd_data) == 0: return np.array([True]) @@ -308,6 +339,7 @@ def _not_nan(pd_data): return ~pd_data.isna().any(axis=1).values else: return pd_data + mask = reduce(lambda a, b: np.logical_and(_not_nan(a), _not_nan(b)), data) return mask @@ -332,8 +364,8 @@ def _subset(pd_data): return [_subset(data) for data in pd_data] -def _file_path_check(filepath=None, format='png', interactive=False, is_plotly=False): - """ Helper function to check the filepath being passed. +def _file_path_check(filepath=None, format="png", interactive=False, is_plotly=False): + """Helper function to check the filepath being passed. Arguments: filepath (str or Path, optional): Location to save file. @@ -349,21 +381,25 @@ def _file_path_check(filepath=None, format='png', interactive=False, is_plotly=F path_and_name, extension = os.path.splitext(filepath) extension = extension[1:].lower() if extension else None if is_plotly and interactive: - format_ = 'html' + format_ = "html" elif not extension and not interactive: format_ = format else: format_ = extension - filepath = f'{path_and_name}.{format_}' + filepath = f"{path_and_name}.{format_}" try: - f = open(filepath, 'w') + f = open(filepath, "w") f.close() except (IOError, FileNotFoundError): - raise ValueError(('Specified filepath is not writeable: {}'.format(filepath))) + raise ValueError( + ("Specified filepath is not writeable: {}".format(filepath)) + ) return filepath -def save_plot(fig, filepath=None, format='png', interactive=False, return_filepath=False): +def save_plot( + fig, filepath=None, format="png", interactive=False, return_filepath=False +): """Saves fig to filepath if specified, or to a default location if not. Arguments: @@ -380,8 +416,12 @@ def save_plot(fig, filepath=None, format='png', interactive=False, return_filepa Defaults to None. """ plotly_ = import_or_raise("plotly", error_msg="Cannot find dependency plotly") - graphviz_ = import_or_raise('graphviz', error_msg='Please install graphviz to visualize trees.') - matplotlib = import_or_raise("matplotlib", error_msg="Cannot find dependency matplotlib") + graphviz_ = import_or_raise( + "graphviz", error_msg="Please install graphviz to visualize trees." 
+ ) + matplotlib = import_or_raise( + "matplotlib", error_msg="Cannot find dependency matplotlib" + ) plt_ = matplotlib.pyplot axes_ = matplotlib.axes @@ -390,7 +430,7 @@ def save_plot(fig, filepath=None, format='png', interactive=False, return_filepa is_plt = False is_seaborn = False - format = format if format else 'png' + format = format if format else "png" if isinstance(fig, plotly_.graph_objects.Figure): is_plotly = True elif isinstance(fig, graphviz_.Source): @@ -401,10 +441,12 @@ def save_plot(fig, filepath=None, format='png', interactive=False, return_filepa is_seaborn = True if not filepath: - extension = 'html' if interactive and is_plotly else format - filepath = os.path.join(os.getcwd(), f'test_plot.{extension}') + extension = "html" if interactive and is_plotly else format + filepath = os.path.join(os.getcwd(), f"test_plot.{extension}") - filepath = _file_path_check(filepath, format=format, interactive=interactive, is_plotly=is_plotly) + filepath = _file_path_check( + filepath, format=format, interactive=interactive, is_plotly=is_plotly + ) if is_plotly and interactive: fig.write_html(file=filepath) @@ -412,8 +454,8 @@ def save_plot(fig, filepath=None, format='png', interactive=False, return_filepa fig.write_image(file=filepath, engine="kaleido") elif is_graphviz: filepath_, format_ = os.path.splitext(filepath) - fig.format = 'png' - filepath = f'{filepath_}.png' + fig.format = "png" + filepath = f"{filepath_}.png" fig.render(filename=filepath_, view=False, cleanup=True) elif is_plt: fig.savefig(fname=filepath) @@ -439,7 +481,9 @@ def deprecate_arg(old_arg, new_arg, old_value, new_value): """ value_to_use = new_value if old_value is not None: - warnings.warn(f"Argument '{old_arg}' has been deprecated in favor of '{new_arg}'. " - f"Passing '{old_arg}' in future versions will result in an error.") + warnings.warn( + f"Argument '{old_arg}' has been deprecated in favor of '{new_arg}'. " + f"Passing '{old_arg}' in future versions will result in an error." + ) value_to_use = old_value return value_to_use diff --git a/evalml/utils/logger.py b/evalml/utils/logger.py index ecac1a128e..b5f2a6adcc 100644 --- a/evalml/utils/logger.py +++ b/evalml/utils/logger.py @@ -13,16 +13,19 @@ def get_logger(name): logger.setLevel(logging.DEBUG) stdout_handler = logging.StreamHandler(sys.stdout) stdout_handler.setLevel(logging.INFO) - stdout_handler.setFormatter(logging.Formatter('%(message)s')) + stdout_handler.setFormatter(logging.Formatter("%(message)s")) logger.addHandler(stdout_handler) - evalml_log_path_str = os.environ.get('EVALML_LOG_FILE', 'evalml_debug.log') + evalml_log_path_str = os.environ.get("EVALML_LOG_FILE", "evalml_debug.log") evalml_log_path = Path(evalml_log_path_str) - warning_msg = 'Continuing without logging to file. To change this, please set the EVALML_LOG_FILE environment variable to a valid file path with write permissions available. To disable debug logging, please set the EVALML_LOG_FILE environment variable to an empty value, or simply ignore this warning.' + warning_msg = "Continuing without logging to file. To change this, please set the EVALML_LOG_FILE environment variable to a valid file path with write permissions available. To disable debug logging, please set the EVALML_LOG_FILE environment variable to an empty value, or simply ignore this warning." if len(evalml_log_path_str) == 0: return logger if evalml_log_path.is_dir() or not os.access(evalml_log_path.parent, os.W_OK): - print(f'Warning: cannot write debug logs to path "{evalml_log_path}". 
' + warning_msg) + print( + f'Warning: cannot write debug logs to path "{evalml_log_path}". ' + + warning_msg + ) return logger try: date_fmt = "%m/%d/%Y %I:%M:%S %p" @@ -32,8 +35,10 @@ def get_logger(name): log_handler.setFormatter(logging.Formatter(fmt=fmt, datefmt=date_fmt)) logger.addHandler(log_handler) except Exception as e: - logger.warning(f'Exception encountered while setting up debug log file at path {evalml_log_path}: {str(e)}') - logger.warning(''.join(traceback.format_tb(sys.exc_info()[2]))) + logger.warning( + f"Exception encountered while setting up debug log file at path {evalml_log_path}: {str(e)}" + ) + logger.warning("".join(traceback.format_tb(sys.exc_info()[2]))) logger.warning(warning_msg) return logger @@ -66,6 +71,6 @@ def time_elapsed(start_time): mins, s = divmod(int(time_diff), 60) h, m = divmod(mins, 60) if h: - return '{0:d}:{1:02d}:{2:02d}'.format(h, m, s) + return "{0:d}:{1:02d}:{2:02d}".format(h, m, s) else: - return '{0:02d}:{1:02d}'.format(m, s) + return "{0:02d}:{1:02d}".format(m, s) diff --git a/evalml/utils/woodwork_utils.py b/evalml/utils/woodwork_utils.py index 5e620726e5..48f4d50e16 100644 --- a/evalml/utils/woodwork_utils.py +++ b/evalml/utils/woodwork_utils.py @@ -1,11 +1,14 @@ - import numpy as np import pandas as pd import woodwork as ww from evalml.utils.gen_utils import is_all_numeric -numeric_and_boolean_ww = [ww.logical_types.Integer, ww.logical_types.Double, ww.logical_types.Boolean] +numeric_and_boolean_ww = [ + ww.logical_types.Integer, + ww.logical_types.Double, + ww.logical_types.Boolean, +] def _numpy_to_pandas(array): @@ -20,18 +23,24 @@ def _list_to_pandas(list): return _numpy_to_pandas(np.array(list)) -_nullable_types = {'Int64', 'Float64', 'boolean'} +_nullable_types = {"Int64", "Float64", "boolean"} def _raise_value_error_if_nullable_types_detected(data): types = {data.name: data.dtype} if isinstance(data, pd.Series) else data.dtypes - cols_with_nullable_types = {col: str(ptype) for col, ptype in dict(types).items() if str(ptype) in _nullable_types} + cols_with_nullable_types = { + col: str(ptype) + for col, ptype in dict(types).items() + if str(ptype) in _nullable_types + } if cols_with_nullable_types: - raise ValueError("Evalml does not support the new pandas nullable types because " - "our dependencies (sklearn, xgboost, lightgbm) do not support them yet." - "If your data does not have missing values, please use the non-nullable types (bool, int64, float64). " - "If your data does have missing values, use float64 for int and float columns and category for boolean columns. " - f"These are the columns with nullable types: {list(cols_with_nullable_types.items())}") + raise ValueError( + "Evalml does not support the new pandas nullable types because " + "our dependencies (sklearn, xgboost, lightgbm) do not support them yet." + "If your data does not have missing values, please use the non-nullable types (bool, int64, float64). " + "If your data does have missing values, use float64 for int and float columns and category for boolean columns. 
" + f"These are the columns with nullable types: {list(cols_with_nullable_types.items())}" + ) def infer_feature_types(data, feature_types=None): @@ -55,7 +64,9 @@ def infer_feature_types(data, feature_types=None): _raise_value_error_if_nullable_types_detected(data) if data.ww.schema is not None: - if isinstance(data, pd.DataFrame) and not ww.is_schema_valid(data, data.ww.schema): + if isinstance(data, pd.DataFrame) and not ww.is_schema_valid( + data, data.ww.schema + ): raise ValueError(ww.get_invalid_schema_message(data, data.ww.schema)) data.ww.init(schema=data.ww.schema) return data @@ -68,7 +79,9 @@ def infer_feature_types(data, feature_types=None): return ww_data -def _retain_custom_types_and_initalize_woodwork(old_logical_types, new_dataframe, ltypes_to_ignore=None): +def _retain_custom_types_and_initalize_woodwork( + old_logical_types, new_dataframe, ltypes_to_ignore=None +): """ Helper method which will take an old Woodwork data structure and a new pandas data structure and return a new data structure that will try to retain as many logical types from the old data structure that exist in the new @@ -88,8 +101,14 @@ def _retain_custom_types_and_initalize_woodwork(old_logical_types, new_dataframe return ww.init_series(new_dataframe, old_logical_types) if ltypes_to_ignore is None: ltypes_to_ignore = [] - col_intersection = set(old_logical_types.keys()).intersection(set(new_dataframe.columns)) - retained_logical_types = {col: ltype for col, ltype in old_logical_types.items() if col in col_intersection and ltype not in ltypes_to_ignore} + col_intersection = set(old_logical_types.keys()).intersection( + set(new_dataframe.columns) + ) + retained_logical_types = { + col: ltype + for col, ltype in old_logical_types.items() + if col in col_intersection and ltype not in ltypes_to_ignore + } new_dataframe.ww.init(logical_types=retained_logical_types) return new_dataframe @@ -106,6 +125,8 @@ def _convert_numeric_dataset_pandas(X, y): Tuple(pd.DataFrame, pd.Series): Transformed X and y""" X_ww = infer_feature_types(X) if not is_all_numeric(X_ww): - raise ValueError('Values not all numeric or there are null values provided in the dataset') + raise ValueError( + "Values not all numeric or there are null values provided in the dataset" + ) y_ww = infer_feature_types(y) return X_ww, y_ww From 88e3c72cbea4e9ba3df7ddd46f4cb3a39be1c0c8 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 4 Jun 2021 14:43:04 -0400 Subject: [PATCH 48/85] lint fixes --- evalml/automl/callbacks.py | 4 +--- evalml/tests/automl_tests/test_automl.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/evalml/automl/callbacks.py b/evalml/automl/callbacks.py index d83b921659..1e2786acec 100644 --- a/evalml/automl/callbacks.py +++ b/evalml/automl/callbacks.py @@ -37,7 +37,5 @@ def log_error_callback(exception, traceback, automl, **kwargs): logger.info( f"\t\t\tFold {fold_num}: Exception during automl search: {str(exception)}" ) - logger.debug( - f"\t\t\tFold {fold_num}: Hyperparameters:\n\t{pipeline.parameters}" - ) + logger.debug(f"\t\t\tFold {fold_num}: Hyperparameters:\n\t{pipeline.parameters}") logger.debug(f"\t\t\tFold {fold_num}: Traceback:\n{trace}") diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 2cd946120c..8f19b60cf3 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -56,7 +56,6 @@ allowed_model_families, get_estimators, ) -from evalml.pipelines.components.utils import allowed_model_families, 
get_estimators from evalml.pipelines.utils import make_pipeline from evalml.preprocessing import TrainingValidationSplit from evalml.problem_types import ( From 62d36a1ba2910395aa95f4fe9aa531285b9fa46a Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 4 Jun 2021 15:02:33 -0400 Subject: [PATCH 49/85] lint update --- evalml/data_checks/invalid_targets_data_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalml/data_checks/invalid_targets_data_check.py b/evalml/data_checks/invalid_targets_data_check.py index 4964e880ae..959d940bb2 100644 --- a/evalml/data_checks/invalid_targets_data_check.py +++ b/evalml/data_checks/invalid_targets_data_check.py @@ -10,11 +10,11 @@ ) from evalml.objectives import get_objective from evalml.problem_types import ( - ProblemTypes, handle_problem_types, is_binary, is_multiclass, is_regression, + ProblemTypes ) from evalml.utils.woodwork_utils import infer_feature_types, numeric_and_boolean_ww From 801f8c334c76af660226b39d3b85aec4029f332c Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 4 Jun 2021 15:10:15 -0400 Subject: [PATCH 50/85] lint update --- evalml/data_checks/invalid_targets_data_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalml/data_checks/invalid_targets_data_check.py b/evalml/data_checks/invalid_targets_data_check.py index 959d940bb2..1c0a06c24a 100644 --- a/evalml/data_checks/invalid_targets_data_check.py +++ b/evalml/data_checks/invalid_targets_data_check.py @@ -14,7 +14,7 @@ is_binary, is_multiclass, is_regression, - ProblemTypes + ProblemTypes, ) from evalml.utils.woodwork_utils import infer_feature_types, numeric_and_boolean_ww From 718f7f5d60a0c2271c1e0b8393ce01f7ea6836ef Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 4 Jun 2021 15:17:56 -0400 Subject: [PATCH 51/85] lint fix --- evalml/data_checks/invalid_targets_data_check.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/evalml/data_checks/invalid_targets_data_check.py b/evalml/data_checks/invalid_targets_data_check.py index 1c0a06c24a..2ee4189e29 100644 --- a/evalml/data_checks/invalid_targets_data_check.py +++ b/evalml/data_checks/invalid_targets_data_check.py @@ -10,13 +10,16 @@ ) from evalml.objectives import get_objective from evalml.problem_types import ( + ProblemTypes, handle_problem_types, is_binary, is_multiclass, - is_regression, - ProblemTypes, + is_regression +) +from evalml.utils.woodwork_utils import ( + infer_feature_types, + numeric_and_boolean_ww ) -from evalml.utils.woodwork_utils import infer_feature_types, numeric_and_boolean_ww class InvalidTargetDataCheck(DataCheck): From cff865fd38cd6454be1c927eeb79109314bceb33 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 4 Jun 2021 15:33:13 -0400 Subject: [PATCH 52/85] lint fix --- evalml/data_checks/invalid_targets_data_check.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evalml/data_checks/invalid_targets_data_check.py b/evalml/data_checks/invalid_targets_data_check.py index 2ee4189e29..6318ec560a 100644 --- a/evalml/data_checks/invalid_targets_data_check.py +++ b/evalml/data_checks/invalid_targets_data_check.py @@ -14,11 +14,11 @@ handle_problem_types, is_binary, is_multiclass, - is_regression + is_regression, ) from evalml.utils.woodwork_utils import ( infer_feature_types, - numeric_and_boolean_ww + numeric_and_boolean_ww, ) From 17a39e59e92fb4e32287d63b4e67331de8eccbc0 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 4 Jun 2021 16:23:08 -0400 Subject: [PATCH 53/85] initial 
commit --- evalml/automl/automl_search.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 2da50da6a6..380e2ac776 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -136,7 +136,7 @@ def __init__( patience=None, tolerance=None, data_splitter=None, - allowed_pipelines=None, + allowed_component_graphs=None, allowed_model_families=None, start_iteration_callback=None, add_result_callback=None, @@ -188,8 +188,8 @@ def __init__( tolerance (float): Minimum percentage difference to qualify as score improvement for early stopping. Only applicable if patience is not None. Defaults to None. - allowed_pipelines (list(class)): A list of PipelineBase subclasses indicating the pipelines allowed in the search. - The default of None indicates all pipelines for this problem type are allowed. Setting this field will cause + allowed_component_graphs (list(str or ComponentBase) or dict): A list or dict of ComponentBase subclasses indicating the component graphs allowed in the search. + The default of None indicates all pipeline component graphs for this problem type are allowed. Setting this field will cause allowed_model_families to be ignored. allowed_model_families (list(str, ModelFamily)): The model families to search. The default of None searches over all @@ -385,17 +385,17 @@ def __init__( "Unable to import plotly; skipping pipeline search plotting\n" ) - if allowed_pipelines is not None and not isinstance(allowed_pipelines, list): + if allowed_component_graphs is not None and not isinstance(allowed_component_graphs, list): raise ValueError( - "Parameter allowed_pipelines must be either None or a list!" + "Parameter allowed_component_graphs must be either None or a list!" ) - if allowed_pipelines is not None and not all( - isinstance(p, PipelineBase) for p in allowed_pipelines + if allowed_component_graphs is not None and not all( + isinstance(p, PipelineBase) for p in allowed_component_graphs ): raise ValueError( - "Every element of allowed_pipelines must an instance of PipelineBase!" + "Every element of allowed_component_graphs an instance of PipelineBase!" 
) - self.allowed_pipelines = allowed_pipelines + self.allowed_component_graphs = allowed_component_graphs self.allowed_model_families = allowed_model_families self._automl_algorithm = None self._start = 0.0 @@ -467,7 +467,7 @@ def __init__( self._sampler_name ] - if self.allowed_pipelines is None: + if self.allowed_component_graphs is None: logger.info("Generating pipelines to search over...") allowed_estimators = get_estimators( self.problem_type, self.allowed_model_families @@ -497,10 +497,10 @@ def __init__( for estimator in allowed_estimators ] - if self.allowed_pipelines == []: - raise ValueError("No allowed pipelines to search") + if self.allowed_component_graphs == []: + raise ValueError("No allowed component graphs to search") - logger.info(f"{len(self.allowed_pipelines)} pipelines ready for search.") + logger.info(f"{len(self.allowed_component_graphs)} component graphs ready for search.") check_all_pipeline_names_unique(self.allowed_pipelines) run_ensembling = self.ensembling From 705855ce12bc0b6281c013f1ce9ef37c79825d04 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 7 Jun 2021 12:27:17 -0400 Subject: [PATCH 54/85] updates --- evalml/automl/automl_search.py | 27 +++++++++--------------- evalml/automl/utils.py | 20 +++++++++++++++++- evalml/tests/automl_tests/test_automl.py | 8 +++---- evalml/tests/conftest.py | 8 ++++++- 4 files changed, 40 insertions(+), 23 deletions(-) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 380e2ac776..c0472abeed 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -20,7 +20,7 @@ check_all_pipeline_names_unique, get_best_sampler_for_data, get_default_primary_search_objective, - make_data_splitter, + make_data_splitter, get_pipelines_from_component_graphs, ) from evalml.data_checks import DefaultDataChecks from evalml.exceptions import ( @@ -188,10 +188,13 @@ def __init__( tolerance (float): Minimum percentage difference to qualify as score improvement for early stopping. Only applicable if patience is not None. Defaults to None. - allowed_component_graphs (list(str or ComponentBase) or dict): A list or dict of ComponentBase subclasses indicating the component graphs allowed in the search. + allowed_component_graphs (list): A list of dictionaries indicating the component graphs allowed in the search. + The format should follow [ {Name_of_graph: [list_of_components]} ]. The default of None indicates all pipeline component graphs for this problem type are allowed. Setting this field will cause allowed_model_families to be ignored. + e.g. allowed_component_graphs = [ {"Name": ["Imputer", "One Hot Encoder", dummy_classifier_estimator_class]} ] + allowed_model_families (list(str, ModelFamily)): The model families to search. The default of None searches over all model families. Run evalml.pipelines.components.utils.allowed_model_families("binary") to see options. Change `binary` to `multiclass` or `regression` depending on the problem type. Note that if allowed_pipelines is provided, @@ -384,17 +387,11 @@ def __init__( logger.warning( "Unable to import plotly; skipping pipeline search plotting\n" ) - + print(allowed_component_graphs) if allowed_component_graphs is not None and not isinstance(allowed_component_graphs, list): raise ValueError( "Parameter allowed_component_graphs must be either None or a list!" 
) - if allowed_component_graphs is not None and not all( - isinstance(p, PipelineBase) for p in allowed_component_graphs - ): - raise ValueError( - "Every element of allowed_component_graphs an instance of PipelineBase!" - ) self.allowed_component_graphs = allowed_component_graphs self.allowed_model_families = allowed_model_families self._automl_algorithm = None @@ -496,6 +493,10 @@ def __init__( ) for estimator in allowed_estimators ] + else: + print() + print(f'Automlsearch __ init __ - self.allowed_component_graphs: {self.allowed_component_graphs}') + self.allowed_pipelines = get_pipelines_from_component_graphs(self.allowed_component_graphs, self.problem_type) if self.allowed_component_graphs == []: raise ValueError("No allowed component graphs to search") @@ -927,14 +928,6 @@ def _validate_problem_type(self): ) ) - for pipeline in self.allowed_pipelines or []: - if pipeline.problem_type != self.problem_type: - raise ValueError( - "Given pipeline {} is not compatible with problem_type {}.".format( - pipeline.name, self.problem_type.value - ) - ) - def _get_baseline_pipeline(self): """Creates a baseline pipeline instance.""" if self.problem_type == ProblemTypes.BINARY: diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index d9a6c41b3e..38d4470b7a 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -5,7 +5,9 @@ from sklearn.model_selection import KFold, StratifiedKFold from evalml.objectives import get_objective -from evalml.pipelines import ComponentGraph +from evalml.pipelines import ComponentGraph, BinaryClassificationPipeline, MulticlassClassificationPipeline, \ + RegressionPipeline, TimeSeriesBinaryClassificationPipeline, TimeSeriesMulticlassClassificationPipeline, \ + TimeSeriesRegressionPipeline from evalml.preprocessing.data_splitters import ( TimeSeriesSplit, TrainingValidationSplit, @@ -219,3 +221,19 @@ def get_hyperparameter_ranges(component_graph, custom_hyperparameters): component_hyperparameters.update(custom_hyperparameters[component_name]) hyperparameter_ranges[component_name] = component_hyperparameters return hyperparameter_ranges + + +def get_pipelines_from_component_graphs(component_graphs_list, problem_type): + pipeline_class = { + ProblemTypes.BINARY: BinaryClassificationPipeline, + ProblemTypes.MULTICLASS: MulticlassClassificationPipeline, + ProblemTypes.REGRESSION: RegressionPipeline, + ProblemTypes.TIME_SERIES_BINARY: TimeSeriesBinaryClassificationPipeline, + ProblemTypes.TIME_SERIES_MULTICLASS: TimeSeriesMulticlassClassificationPipeline, + ProblemTypes.TIME_SERIES_REGRESSION: TimeSeriesRegressionPipeline + }[handle_problem_types(problem_type)] + allowed_pipelines = [] + for component_graph in component_graphs_list: + for comp_name, comp_graph in component_graph.items(): + allowed_pipelines.append(pipeline_class(component_graph=comp_graph, custom_name=comp_name)) + return allowed_pipelines diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 8f19b60cf3..679174209f 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -1010,7 +1010,7 @@ def test_default_objective(X_y_binary): @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") -def test_add_to_rankings(mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary): +def test_add_to_rankings(mock_fit, mock_score, dummy_binary_component_graph, dummy_binary_pipeline_class, X_y_binary): X, y = X_y_binary 
mock_score.return_value = {"Log Loss Binary": 1.0} @@ -1019,7 +1019,7 @@ def test_add_to_rankings(mock_fit, mock_score, dummy_binary_pipeline_class, X_y_ y_train=y, problem_type="binary", max_iterations=1, - allowed_pipelines=[dummy_binary_pipeline_class({})], + allowed_component_graphs=[dummy_binary_component_graph] ) automl.search() assert len(automl.rankings) == 1 @@ -1054,7 +1054,7 @@ def test_add_to_rankings(mock_fit, mock_score, dummy_binary_pipeline_class, X_y_ @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_add_to_rankings_no_search( - mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary + mock_fit, mock_score, dummy_binary_component_graph, dummy_binary_pipeline_class, X_y_binary ): X, y = X_y_binary automl = AutoMLSearch( @@ -1062,7 +1062,7 @@ def test_add_to_rankings_no_search( y_train=y, problem_type="binary", max_iterations=1, - allowed_pipelines=[dummy_binary_pipeline_class({})], + allowed_component_graphs=[dummy_binary_component_graph], ) mock_score.return_value = {"Log Loss Binary": 0.5234} diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 654af8aa4f..0d39be38dd 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -20,7 +20,7 @@ RegressionPipeline, TimeSeriesBinaryClassificationPipeline, TimeSeriesMulticlassClassificationPipeline, - TimeSeriesRegressionPipeline, + TimeSeriesRegressionPipeline, ComponentGraph, RandomForestClassifier, ) from evalml.pipelines.components import ( DecisionTreeClassifier, @@ -347,6 +347,12 @@ def fit(self, X, y): return MockEstimator +@pytest.fixture +def dummy_binary_component_graph(dummy_classifier_estimator_class): + component_graph = {"Name": ["Imputer", "One Hot Encoder", dummy_classifier_estimator_class]} + return component_graph + + @pytest.fixture def dummy_binary_pipeline_class(dummy_classifier_estimator_class): MockEstimator = dummy_classifier_estimator_class From aa461046fcbce4ab690fb5fe7818401ca08f7a66 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 8 Jun 2021 13:53:38 -0400 Subject: [PATCH 55/85] add docstring --- evalml/automl/utils.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index 38d4470b7a..4860eb6e2d 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -224,6 +224,16 @@ def get_hyperparameter_ranges(component_graph, custom_hyperparameters): def get_pipelines_from_component_graphs(component_graphs_list, problem_type): + """ + Returns created pipelines from passed component graphs based on the specified problem type. + + Arguments: + component_graphs_list (list): The list of component graphs. + problem_type (str or ProblemType): The problem type for which pipelines will be created. + + Returns: + list: List of pipelines made from the passed component graphs. 
+ """ pipeline_class = { ProblemTypes.BINARY: BinaryClassificationPipeline, ProblemTypes.MULTICLASS: MulticlassClassificationPipeline, @@ -232,8 +242,8 @@ def get_pipelines_from_component_graphs(component_graphs_list, problem_type): ProblemTypes.TIME_SERIES_MULTICLASS: TimeSeriesMulticlassClassificationPipeline, ProblemTypes.TIME_SERIES_REGRESSION: TimeSeriesRegressionPipeline }[handle_problem_types(problem_type)] - allowed_pipelines = [] + created_pipelines = [] for component_graph in component_graphs_list: for comp_name, comp_graph in component_graph.items(): - allowed_pipelines.append(pipeline_class(component_graph=comp_graph, custom_name=comp_name)) - return allowed_pipelines + created_pipelines.append(pipeline_class(component_graph=comp_graph, custom_name=comp_name)) + return created_pipelines From 84ce2aa43bfbae81c90701e81f3639ac1dc0b826 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 8 Jun 2021 14:33:14 -0400 Subject: [PATCH 56/85] update api ref --- docs/source/api_reference.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/api_reference.rst b/docs/source/api_reference.rst index b5b487fb59..fd070c7cca 100644 --- a/docs/source/api_reference.rst +++ b/docs/source/api_reference.rst @@ -257,6 +257,7 @@ Regressors are components that output a predicted target value. DecisionTreeRegressor LightGBMRegressor SVMRegressor + ProphetRegressor .. currentmodule:: evalml.model_understanding From 5d8ae24c94c3cad322b1f16a5737d49765fa7c34 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 8 Jun 2021 14:33:32 -0400 Subject: [PATCH 57/85] update api ref --- docs/source/api_reference.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/api_reference.rst b/docs/source/api_reference.rst index fd070c7cca..b5b487fb59 100644 --- a/docs/source/api_reference.rst +++ b/docs/source/api_reference.rst @@ -257,7 +257,6 @@ Regressors are components that output a predicted target value. DecisionTreeRegressor LightGBMRegressor SVMRegressor - ProphetRegressor .. 
currentmodule:: evalml.model_understanding From 5d3aec2606fe52c0d92b35318fdcd3820c00413f Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 8 Jun 2021 23:24:52 -0400 Subject: [PATCH 58/85] test updates --- evalml/automl/automl_search.py | 11 +- evalml/automl/utils.py | 8 +- evalml/tests/automl_tests/test_automl.py | 283 +++++++----------- .../tests/automl_tests/test_automl_utils.py | 4 + evalml/tests/conftest.py | 32 +- 5 files changed, 147 insertions(+), 191 deletions(-) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index c0472abeed..fb283488b6 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -494,14 +494,15 @@ def __init__( for estimator in allowed_estimators ] else: - print() - print(f'Automlsearch __ init __ - self.allowed_component_graphs: {self.allowed_component_graphs}') self.allowed_pipelines = get_pipelines_from_component_graphs(self.allowed_component_graphs, self.problem_type) + print("AutoMLSearch - init - self.allowed_pipelines") + print('---------------------------------------------') + print(self.allowed_pipelines) - if self.allowed_component_graphs == []: - raise ValueError("No allowed component graphs to search") + if self.allowed_pipelines == []: + raise ValueError("No allowed pipelines to search") - logger.info(f"{len(self.allowed_component_graphs)} component graphs ready for search.") + logger.info(f"{len(self.allowed_pipelines)} component graphs ready for search.") check_all_pipeline_names_unique(self.allowed_pipelines) run_ensembling = self.ensembling diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index 4860eb6e2d..568e25ecc6 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -244,6 +244,10 @@ def get_pipelines_from_component_graphs(component_graphs_list, problem_type): }[handle_problem_types(problem_type)] created_pipelines = [] for component_graph in component_graphs_list: - for comp_name, comp_graph in component_graph.items(): - created_pipelines.append(pipeline_class(component_graph=comp_graph, custom_name=comp_name)) + comp_seed = 0 + if "random_seed" in component_graph.keys(): + comp_seed = component_graph.pop("random_seed") + comp_name = next(iter(component_graph)) + comp_graph = component_graph[comp_name] + created_pipelines.append(pipeline_class(component_graph=comp_graph, custom_name=comp_name, random_seed=comp_seed)) return created_pipelines diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 679174209f..0574f46cac 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -21,7 +21,7 @@ from evalml.automl.utils import ( _LARGE_DATA_PERCENT_VALIDATION, _LARGE_DATA_ROW_THRESHOLD, - get_default_primary_search_objective, + get_default_primary_search_objective, get_pipelines_from_component_graphs, ) from evalml.demos import load_breast_cancer, load_wine from evalml.exceptions import ( @@ -504,25 +504,6 @@ def test_automl_feature_selection(mock_fit, mock_score, X_y_binary): X, y = X_y_binary mock_score.return_value = {"Log Loss Binary": 1.0} - class MockFeatureSelectionPipeline(BinaryClassificationPipeline): - component_graph = [ - "RF Classifier Select From Model", - "Logistic Regression Classifier", - ] - - def __init__(self, parameters, random_seed=0): - super().__init__(self.component_graph, parameters=parameters) - - def new(self, parameters, random_seed=0): - return self.__class__(parameters, random_seed=random_seed) - - def clone(self): - return 
self.__class__(self.parameters, random_seed=self.random_seed) - - def fit(self, X, y): - """Mock fit, noop""" - - allowed_pipelines = [MockFeatureSelectionPipeline({})] start_iteration_callback = MagicMock() automl = AutoMLSearch( X_train=X, @@ -530,7 +511,10 @@ def fit(self, X, y): problem_type="binary", max_iterations=2, start_iteration_callback=start_iteration_callback, - allowed_pipelines=allowed_pipelines, + allowed_component_graphs=[ + {"Name": ["RF Classifier Select From Model", + "Logistic Regression Classifier"]}, + ], ) automl.search() @@ -586,25 +570,25 @@ def test_automl_algorithm(mock_fit, mock_score, mock_algo_next_batch, X_y_binary @patch("evalml.automl.automl_algorithm.IterativeAlgorithm.__init__") -def test_automl_allowed_pipelines_algorithm( - mock_algo_init, dummy_binary_pipeline_class, X_y_binary +def test_automl_allowed_component_graphs_algorithm( + mock_algo_init, dummy_classifier_estimator_class, dummy_binary_pipeline_class, X_y_binary ): mock_algo_init.side_effect = Exception("mock algo init") X, y = X_y_binary - allowed_pipelines = [dummy_binary_pipeline_class({})] + allowed_component_graphs = [{"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}] with pytest.raises(Exception, match="mock algo init"): AutoMLSearch( X_train=X, y_train=y, problem_type="binary", - allowed_pipelines=allowed_pipelines, + allowed_component_graphs=allowed_component_graphs, max_iterations=10, ) assert mock_algo_init.call_count == 1 _, kwargs = mock_algo_init.call_args assert kwargs["max_iterations"] == 10 - assert kwargs["allowed_pipelines"] == allowed_pipelines + assert kwargs["allowed_pipelines"] == get_pipelines_from_component_graphs(allowed_component_graphs, "binary") allowed_model_families = [ModelFamily.RANDOM_FOREST] with pytest.raises(Exception, match="mock algo init"): @@ -897,19 +881,19 @@ def test_data_splitter_shuffle(): ) -def test_allowed_pipelines_with_incorrect_problem_type( - dummy_binary_pipeline_class, X_y_binary +def test_component_graph_with_incorrect_problem_type( + dummy_classifier_estimator_class, X_y_binary ): X, y = X_y_binary - # checks that not setting allowed_pipelines does not error out + # checks that not setting component graphs does not error out AutoMLSearch(X_train=X, y_train=y, problem_type="binary") - with pytest.raises(ValueError, match="is not compatible with problem_type"): + with pytest.raises(ValueError, match="not valid for this component graph"): AutoMLSearch( X_train=X, y_train=y, problem_type="regression", - allowed_pipelines=[dummy_binary_pipeline_class({})], + allowed_component_graphs=[{"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}], ) @@ -1010,7 +994,7 @@ def test_default_objective(X_y_binary): @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") -def test_add_to_rankings(mock_fit, mock_score, dummy_binary_component_graph, dummy_binary_pipeline_class, X_y_binary): +def test_add_to_rankings(mock_fit, mock_score, dummy_binary_linear_component_graph, dummy_binary_pipeline_class, X_y_binary): X, y = X_y_binary mock_score.return_value = {"Log Loss Binary": 1.0} @@ -1019,7 +1003,7 @@ def test_add_to_rankings(mock_fit, mock_score, dummy_binary_component_graph, dum y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs=[dummy_binary_component_graph] + allowed_component_graphs=[dummy_binary_linear_component_graph] ) automl.search() assert len(automl.rankings) == 1 @@ -1054,7 +1038,7 @@ def 
test_add_to_rankings(mock_fit, mock_score, dummy_binary_component_graph, dum @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_add_to_rankings_no_search( - mock_fit, mock_score, dummy_binary_component_graph, dummy_binary_pipeline_class, X_y_binary + mock_fit, mock_score, dummy_binary_linear_component_graph, dummy_binary_pipeline_class, X_y_binary ): X, y = X_y_binary automl = AutoMLSearch( @@ -1062,7 +1046,7 @@ def test_add_to_rankings_no_search( y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs=[dummy_binary_component_graph], + allowed_component_graphs=[dummy_binary_linear_component_graph], ) mock_score.return_value = {"Log Loss Binary": 0.5234} @@ -1086,14 +1070,14 @@ def test_add_to_rankings_no_search( @patch("evalml.pipelines.RegressionPipeline.score") -def test_add_to_rankings_regression_large(mock_score, dummy_regression_pipeline_class): +def test_add_to_rankings_regression_large(mock_score, dummy_regressor_linear_component_graph, dummy_regression_pipeline_class): X = pd.DataFrame({"col_0": [i for i in range(101000)]}) y = pd.Series([i for i in range(101000)]) automl = AutoMLSearch( X_train=X, y_train=y, - allowed_pipelines=[dummy_regression_pipeline_class({})], + allowed_component_graphs=[dummy_regressor_linear_component_graph], problem_type="regression", max_time=1, max_iterations=1, @@ -1126,14 +1110,14 @@ def test_add_to_rankings_new_pipeline(dummy_regression_pipeline_class): @patch("evalml.pipelines.RegressionPipeline.score") def test_add_to_rankings_regression( - mock_score, dummy_regression_pipeline_class, X_y_regression + mock_score, dummy_regressor_linear_component_graph, dummy_regression_pipeline_class, X_y_regression ): X, y = X_y_regression automl = AutoMLSearch( X_train=X, y_train=y, - allowed_pipelines=[dummy_regression_pipeline_class({})], + allowed_component_graphs=[dummy_regressor_linear_component_graph], problem_type="regression", max_time=1, max_iterations=1, @@ -1150,7 +1134,7 @@ def test_add_to_rankings_regression( @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_add_to_rankings_duplicate( - mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary + mock_fit, mock_score, dummy_binary_linear_component_graph, dummy_binary_pipeline_class, X_y_binary ): X, y = X_y_binary mock_score.return_value = {"Log Loss Binary": 0.1234} @@ -1160,7 +1144,7 @@ def test_add_to_rankings_duplicate( y_train=y, problem_type="binary", max_iterations=1, - allowed_pipelines=[dummy_binary_pipeline_class({})], + allowed_component_graphs=[dummy_binary_linear_component_graph], ) automl.search() best_pipeline = automl.best_pipeline @@ -1175,7 +1159,7 @@ def test_add_to_rankings_duplicate( @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_add_to_rankings_trained( - mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary + mock_fit, mock_score, dummy_classifier_estimator_class, dummy_binary_pipeline_class, X_y_binary ): X, y = X_y_binary mock_score.return_value = {"Log Loss Binary": 1.0} @@ -1188,10 +1172,9 @@ class CoolBinaryClassificationPipeline(dummy_binary_pipeline_class): y_train=y, problem_type="binary", max_iterations=1, - allowed_pipelines=[ - dummy_binary_pipeline_class({}), - CoolBinaryClassificationPipeline({}), - ], + allowed_component_graphs=[ + {"Cool Binary Classification Pipeline": 
[dummy_classifier_estimator_class]}, + {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}], ) automl.search() assert len(automl.rankings) == 1 @@ -1696,6 +1679,7 @@ def test_pipelines_in_batch_return_nan( mock_full_rankings, mock_next_batch, X_y_binary, + dummy_classifier_estimator_class, dummy_binary_pipeline_class, ): X, y = X_y_binary @@ -1721,7 +1705,7 @@ def test_pipelines_in_batch_return_nan( y_train=y, problem_type="binary", max_batches=3, - allowed_pipelines=[dummy_binary_pipeline_class({})], + allowed_component_graphs=[{"Name": [dummy_classifier_estimator_class]}], n_jobs=1, ) with pytest.raises( @@ -1739,6 +1723,7 @@ def test_pipelines_in_batch_return_none( mock_full_rankings, mock_next_batch, X_y_binary, + dummy_classifier_estimator_class, dummy_binary_pipeline_class, ): X, y = X_y_binary @@ -1764,7 +1749,7 @@ def test_pipelines_in_batch_return_none( y_train=y, problem_type="binary", max_batches=3, - allowed_pipelines=[dummy_binary_pipeline_class({})], + allowed_component_graphs=[{"Name": [dummy_classifier_estimator_class]}], n_jobs=1, ) with pytest.raises( @@ -1913,7 +1898,8 @@ class Pipeline2(DummyPipeline): y_train=y, problem_type=problem_type_value, max_iterations=3, - allowed_pipelines=[Pipeline1({}), Pipeline2({})], + allowed_component_graphs=[{"Pipeline1": {}}, + {"Pipeline2": {}}], objective=objective(0, 0, 0, 0), additional_objectives=[], optimize_thresholds=False, @@ -1925,7 +1911,7 @@ class Pipeline2(DummyPipeline): y_train=y, problem_type=problem_type_value, max_iterations=3, - allowed_pipelines=[ + allowed_component_graphs=[ Pipeline1({"pipeline": {"date_index": None, "gap": 0, "max_delay": 0}}), Pipeline2({"pipeline": {"date_index": None, "gap": 0, "max_delay": 0}}), ], @@ -1941,7 +1927,8 @@ class Pipeline2(DummyPipeline): y_train=y, problem_type=problem_type_value, max_iterations=3, - allowed_pipelines=[Pipeline1({}), Pipeline2({})], + allowed_component_graphs=[{"Pipeline1": {}}, + {"Pipeline2": {}}], objective=objective, additional_objectives=[], optimize_thresholds=False, @@ -2154,7 +2141,6 @@ def clone(self): y_train=y, problem_type="binary", max_iterations=2, - allowed_pipelines=[DummyPipeline({})], objective="log loss binary", additional_objectives=["f1"], ) @@ -2404,12 +2390,16 @@ def test_automl_one_allowed_pipeline_ensembling_disabled( caplog.clear() max_iterations = _get_first_stacked_classifier_no([ModelFamily.LINEAR_MODEL]) + 1 + allowed_component_graph = {"Logistic Regression Binary Pipeline": ["Imputer", + "One Hot Encoder", + "Standard Scaler", + "Logistic Regression Classifier",]} automl = AutoMLSearch( X_train=X, y_train=y, problem_type="binary", max_iterations=max_iterations, - allowed_pipelines=[logistic_regression_binary_pipeline_class({})], + allowed_component_graphs=[allowed_component_graph], ensembling=True, ) automl.search() @@ -2666,14 +2656,14 @@ def test_iterative_algorithm_pipeline_hyperparameters_make_pipeline_other_errors } estimators = get_estimators("multiclass", [ModelFamily.EXTRA_TREES]) - pipelines = [ - make_pipeline(X, y, estimator, "multiclass", None) for estimator in estimators + component_graphs = [ + {f'CG_{ind}': [estimator]} for ind, estimator in enumerate(estimators) ] automl = AutoMLSearch( X_train=X, y_train=y, problem_type="multiclass", - allowed_pipelines=pipelines, + allowed_component_graphs=component_graphs, custom_hyperparameters=custom_hyperparameters, n_jobs=1, ) @@ -2686,7 +2676,7 @@ def test_iterative_algorithm_pipeline_hyperparameters_make_pipeline_other_errors 
@pytest.mark.parametrize( - "pipelines,pipeline_parameters", [(True, False), (True, True), (False, False)] + "component_graphs", [True, False] ) @pytest.mark.parametrize("automl_parameters", [True, False]) @pytest.mark.parametrize("custom_hyperparameters", [True, False]) @@ -2700,38 +2690,21 @@ def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline( mock_score, custom_hyperparameters, automl_parameters, - pipelines, - pipeline_parameters, + component_graphs, X_y_multi, ): X, y = X_y_multi X = pd.DataFrame(X, columns=[f"Column_{i}" for i in range(20)]) - pipeline_parameters_ = None - pipeline_ = None + component_graph_ = None automl_parameters_ = None custom_hyperparameters_ = None - if pipeline_parameters: - pipeline_parameters_ = { - "Drop Columns Transformer": { - "columns": ["Column_0", "Column_1", "Column_2"] - }, - "Imputer": {"numeric_impute_strategy": "most_frequent"}, - "Random Forest Classifier": {"n_estimators": 200, "max_depth": 11}, - } - - if pipelines: + if component_graphs: component_graph_ = [ - "Drop Columns Transformer", - "Imputer", - "Random Forest Classifier", - ] - pipeline_ = [ - MulticlassClassificationPipeline( - component_graph=component_graph_, parameters=pipeline_parameters_ - ) - ] + {"Name_0": ["Drop Columns Transformer", + "Imputer", + "Random Forest Classifier",]}] if automl_parameters: automl_parameters_ = { @@ -2753,7 +2726,7 @@ def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline( X_train=X, y_train=y, problem_type="multiclass", - allowed_pipelines=pipeline_, + allowed_component_graphs=component_graph_, pipeline_parameters=automl_parameters_, custom_hyperparameters=custom_hyperparameters_, max_batches=4, @@ -2762,14 +2735,12 @@ def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline( for i, row in automl.full_rankings.iterrows(): if "Random Forest Classifier" in row["pipeline_name"]: - if pipelines and automl_parameters: + if component_graph_ and automl_parameters: assert row["parameters"]["Drop Columns Transformer"]["columns"] == [ "Column_0", "Column_1", "Column_2", ] - elif pipeline_parameters: - assert row["parameters"]["Drop Columns Transformer"]["columns"] is None if custom_hyperparameters_: assert ( row["parameters"]["Imputer"]["numeric_impute_strategy"] @@ -2819,7 +2790,7 @@ def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline( ) @patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_iterative_algorithm_passes_njobs_to_pipelines( - mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary + mock_fit, mock_score, dummy_classifier_estimator_class, dummy_binary_pipeline_class, X_y_binary ): X, y = X_y_binary @@ -2842,14 +2813,10 @@ def __init__(self, n_jobs=-1, random_seed=0): problem_type="binary", n_jobs=3, max_batches=2, - allowed_pipelines=[ - BinaryClassificationPipeline( - [MockEstimatorWithNJobs], custom_name="Pipeline 1" - ), - BinaryClassificationPipeline( - [MockEstimatorWithNJobs], custom_name="Pipeline 2" - ), - dummy_binary_pipeline_class({}), + allowed_component_graphs=[ + {"Pipeline 1": [MockEstimatorWithNJobs]}, + {"Pipeline 2": [MockEstimatorWithNJobs]}, + {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} ], ) automl.search() @@ -3040,40 +3007,17 @@ def test_automl_respects_random_seed( X, y = X_y_binary - class DummyPipeline(BinaryClassificationPipeline): - component_graph = [dummy_classifier_estimator_class] - num_pipelines_different_seed = 0 - num_pipelines_init = 0 - - def __init__(self, parameters, 
random_seed=0): - is_diff_random_seed = not (random_seed == 42) - self.__class__.num_pipelines_init += 1 - self.__class__.num_pipelines_different_seed += is_diff_random_seed - super().__init__( - self.component_graph, parameters=parameters, random_seed=random_seed - ) - - def new(self, parameters, random_seed=0): - return self.__class__(parameters, random_seed=random_seed) - - def clone(self): - return self.__class__(self.parameters, random_seed=self.random_seed) - - pipelines = [DummyPipeline({})] - DummyPipeline.num_pipelines_different_seed = 0 automl = AutoMLSearch( X_train=X, y_train=y, problem_type="binary", - allowed_pipelines=pipelines, + allowed_component_graphs=[{"Name_0": [dummy_classifier_estimator_class], + "random_seed": 0}], random_seed=42, max_iterations=10, ) automl.search() - assert ( - DummyPipeline.num_pipelines_different_seed == 0 - and DummyPipeline.num_pipelines_init - ) + assert automl.allowed_pipelines[0].random_seed == 0 @pytest.mark.parametrize( @@ -3559,12 +3503,12 @@ def test_automl_respects_pipeline_parameters_with_duplicate_components( ) component_graph_linear = ["Imputer", "Imputer", "Random Forest Classifier"] - pipeline_linear = BinaryClassificationPipeline(component_graph_linear) automl = AutoMLSearch( X, y, problem_type="binary", - allowed_pipelines=[pipeline_dict, pipeline_linear], + allowed_component_graphs=[{"Pipeline from dict": component_graph_dict}, + {"Pipeline from linear": ["Imputer", "Imputer", "Random Forest Classifier"]}], pipeline_parameters={ "Imputer": {"numeric_impute_strategy": "most_frequent"}, "Imputer_1": {"numeric_impute_strategy": "median"}, @@ -3581,22 +3525,18 @@ def test_automl_respects_pipeline_parameters_with_duplicate_components( "One Hot Encoder_1": ["One Hot Encoder", "One Hot Encoder"], "Random Forest Classifier": ["Random Forest Classifier", "One Hot Encoder_1"], } - pipeline_dict = BinaryClassificationPipeline( - component_graph_dict, custom_name="Pipeline from dict" - ) - component_graph_linear = [ "One Hot Encoder", "One Hot Encoder", "Random Forest Classifier", ] - pipeline_linear = BinaryClassificationPipeline(component_graph_linear) automl = AutoMLSearch( X, y, problem_type="binary", - allowed_pipelines=[pipeline_linear, pipeline_dict], + allowed_component_graphs=[{"Pipeline from dict": component_graph_dict}, + {"Pipeline from linear": component_graph_linear}], pipeline_parameters={ "One Hot Encoder": {"top_n": 15}, "One Hot Encoder_1": {"top_n": 25}, @@ -3628,8 +3568,7 @@ def test_automl_respects_pipeline_custom_hyperparameters_with_duplicate_componen }, "Random Forest Classifier": {"n_estimators": Categorical([100, 125])}, } - component_graph = ["Imputer", "Imputer", "Random Forest Classifier"] - pipeline_ = BinaryClassificationPipeline(component_graph) + component_graph = {"Name_linear": ["Imputer", "Imputer", "Random Forest Classifier"]} else: custom_hyperparameters = { "Imputer": { @@ -3638,49 +3577,49 @@ def test_automl_respects_pipeline_custom_hyperparameters_with_duplicate_componen "Imputer_1": {"numeric_impute_strategy": Categorical(["median", "mean"])}, "Random Forest Classifier": {"n_estimators": Categorical([50, 100])}, } - component_graph = { + component_graph = {"Name_dict": { "Imputer": ["Imputer"], "Imputer_1": ["Imputer", "Imputer"], "Random Forest Classifier": ["Random Forest Classifier", "Imputer_1"], - } - pipeline_ = BinaryClassificationPipeline( - component_graph, custom_name="Pipeline from dict" - ) + }} automl = AutoMLSearch( X, y, problem_type="binary", - allowed_pipelines=[pipeline_], + 
allowed_component_graphs=[component_graph], custom_hyperparameters=custom_hyperparameters, max_batches=5, ) + from pprint import pp automl.search() for i, row in automl.full_rankings.iterrows(): if "Mode Baseline Binary" in row["pipeline_name"]: continue - if row["pipeline_name"] == "Pipeline Dict": - assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in { - "most_frequent", - "mean", - } + if row["pipeline_name"] == "Name_linear": + print("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") + pp(row.parameters) + assert row["parameters"]["Imputer"]["numeric_impute_strategy"] == "mean" assert row["parameters"]["Imputer_1"]["numeric_impute_strategy"] in { - "median", + "most_frequent", "mean", } assert row["parameters"]["Random Forest Classifier"]["n_estimators"] in { - 50, 100, + 125, } if row["pipeline_name"] == "Pipe Line Linear": - assert row["parameters"]["Imputer"]["numeric_impute_strategy"] == "mean" - assert row["parameters"]["Imputer_1"]["numeric_impute_strategy"] in { + assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in { "most_frequent", "mean", } + assert row["parameters"]["Imputer_1"]["numeric_impute_strategy"] in { + "median", + "mean", + } assert row["parameters"]["Random Forest Classifier"]["n_estimators"] in { + 50, 100, - 125, } @@ -3694,30 +3633,20 @@ def test_automl_adds_pipeline_parameters_to_custom_pipeline_hyperparams( ): X, y = X_y_binary - component_graph = { + component_graph_0 = {"Pipe Line One": { "Imputer": ["Imputer"], "Imputer_1": ["Imputer", "Imputer"], "One Hot Encoder": ["One Hot Encoder", "Imputer_1"], "Random Forest Classifier": ["Random Forest Classifier", "One Hot Encoder"], - } - pipeline_one = BinaryClassificationPipeline( - component_graph, custom_name="Pipe Line One" - ) - pipeline_two = BinaryClassificationPipeline( - ["Imputer", "Imputer", "One Hot Encoder", "Random Forest Classifier"], - custom_name="Pipe Line Two", - ) - - pipeline_three = BinaryClassificationPipeline( - ["Imputer", "Imputer", "One Hot Encoder", "Random Forest Classifier"], - custom_name="Pipe Line Three", - ) + }} + component_graph_1 = {"Pipe Line Two": ["Imputer", "Imputer", "One Hot Encoder", "Random Forest Classifier"]} + component_graph_2 = {"Pipe Line Three": ["Imputer", "Imputer", "One Hot Encoder", "Random Forest Classifier"]} automl = AutoMLSearch( X, y, problem_type="binary", - allowed_pipelines=[pipeline_one, pipeline_two, pipeline_three], + allowed_component_graphs=[component_graph_0, component_graph_1, component_graph_2], pipeline_parameters={"Imputer": {"numeric_impute_strategy": "most_frequent"}}, custom_hyperparameters={ "One Hot Encoder": {"top_n": Categorical([12, 10])}, @@ -3847,20 +3776,11 @@ def test_automl_check_high_variance_logs_warning(mock_fit_binary, X_y_binary, ca def test_automl_raises_error_with_duplicate_pipeline_names(X_y_binary): X, y = X_y_binary - pipeline_1 = BinaryClassificationPipeline( - component_graph=["Imputer", "Random Forest Classifier"], - custom_name="Custom Pipeline", - ) - pipeline_2 = BinaryClassificationPipeline( - component_graph=["Imputer", "Logistic Regression Classifier"], - custom_name="Custom Pipeline", - ) - pipeline_3 = BinaryClassificationPipeline( - component_graph=["Logistic Regression Classifier"], custom_name="My Pipeline 3" - ) - pipeline_4 = BinaryClassificationPipeline( - component_graph=["Random Forest Classifier"], custom_name="My Pipeline 3" - ) + + component_graph_0 = {"Custom Pipeline": ["Imputer", "Random Forest Classifier"]} + component_graph_1 = {"Custom Pipeline": ["Imputer", "Logistic 
Regression Classifier"]} + component_graph_2 = {"My Pipeline 3": ["Logistic Regression Classifier"]} + component_graph_3 = {"My Pipeline 3": ["Random Forest Classifier"]} with pytest.raises( ValueError, @@ -3870,7 +3790,7 @@ def test_automl_raises_error_with_duplicate_pipeline_names(X_y_binary): X, y, problem_type="binary", - allowed_pipelines=[pipeline_1, pipeline_2, pipeline_3], + allowed_component_graphs=[component_graph_0, component_graph_1, component_graph_2], ) with pytest.raises( @@ -3881,7 +3801,7 @@ def test_automl_raises_error_with_duplicate_pipeline_names(X_y_binary): X, y, problem_type="binary", - allowed_pipelines=[pipeline_1, pipeline_2, pipeline_3, pipeline_4], + allowed_component_graphs=[component_graph_0, component_graph_1, component_graph_2, component_graph_3], ) @@ -4092,6 +4012,7 @@ def test_score_batch_works( mock_score, pipeline_score_side_effect, X_y_binary, + dummy_classifier_estimator_class, dummy_binary_pipeline_class, stackable_classifiers, caplog, @@ -4121,7 +4042,7 @@ def test_score_batch_works( y_train=y, problem_type="binary", max_iterations=1, - allowed_pipelines=[dummy_binary_pipeline_class({})], + allowed_component_graphs=[{"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}], ) def make_pipeline_name(index): @@ -4171,7 +4092,7 @@ def score_batch_and_check(): def test_train_pipelines_score_pipelines_raise_exception_with_duplicate_names( - X_y_binary, dummy_binary_pipeline_class + X_y_binary, dummy_classifier_estimator_class, dummy_binary_pipeline_class ): class Pipeline1(dummy_binary_pipeline_class): custom_name = "My Pipeline" @@ -4186,7 +4107,7 @@ class Pipeline2(dummy_binary_pipeline_class): y_train=y, problem_type="binary", max_iterations=1, - allowed_pipelines=[dummy_binary_pipeline_class({})], + allowed_component_graphs=[{"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}], ) with pytest.raises( @@ -4203,7 +4124,7 @@ class Pipeline2(dummy_binary_pipeline_class): def test_score_batch_before_fitting_yields_error_nan_scores( - X_y_binary, dummy_binary_pipeline_class, caplog + X_y_binary, dummy_classifier_estimator_class, dummy_binary_pipeline_class, caplog ): X, y = X_y_binary @@ -4212,7 +4133,7 @@ def test_score_batch_before_fitting_yields_error_nan_scores( y_train=y, problem_type="binary", max_iterations=1, - allowed_pipelines=[dummy_binary_pipeline_class({})], + allowed_component_graphs=[{"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}], ) scored_pipelines = automl.score_pipelines( diff --git a/evalml/tests/automl_tests/test_automl_utils.py b/evalml/tests/automl_tests/test_automl_utils.py index 5b05d67e4b..87084d3bd5 100644 --- a/evalml/tests/automl_tests/test_automl_utils.py +++ b/evalml/tests/automl_tests/test_automl_utils.py @@ -10,6 +10,7 @@ _LARGE_DATA_PERCENT_VALIDATION, _LARGE_DATA_ROW_THRESHOLD, get_best_sampler_for_data, + get_pipelines_from_component_graphs, get_default_primary_search_objective, get_hyperparameter_ranges, make_data_splitter, @@ -309,3 +310,6 @@ def test_get_hyperparameter_ranges(): ) assert algo_ranges == hyper_ranges + + +def test_get_pipelines_from_component_graphs(): diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 0d39be38dd..227b435dd6 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -348,9 +348,35 @@ def fit(self, X, y): @pytest.fixture -def dummy_binary_component_graph(dummy_classifier_estimator_class): - component_graph = {"Name": ["Imputer", "One Hot Encoder", dummy_classifier_estimator_class]} - 
return component_graph +def dummy_binary_linear_component_graph(dummy_classifier_estimator_class): + component_graph_linear = {"Name": ["Imputer", "One Hot Encoder", dummy_classifier_estimator_class]} + return component_graph_linear + + +@pytest.fixture +def dummy_regressor_linear_component_graph(dummy_regressor_estimator_class): + component_graph_linear = {"Name": ["Imputer", "One Hot Encoder", dummy_regressor_estimator_class]} + return component_graph_linear + + +@pytest.fixture +def dummy_binary_dict_component_graph(dummy_classifier_estimator_class): + component_graph_dict = { + "Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + "Random Forest Classifier": [dummy_classifier_estimator_class, "Imputer_1"], + } + return component_graph_dict + + +@pytest.fixture +def dummy_regressor_dict_component_graph(dummy_regressor_estimator_class): + component_graph_dict = { + "Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + "Random Forest Classifier": [dummy_regressor_estimator_class, "Imputer_1"], + } + return component_graph_dict @pytest.fixture From 562648acba38c2c0c07708640a2dc8e9674b908c Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Wed, 9 Jun 2021 15:53:40 -0400 Subject: [PATCH 59/85] test updates --- .../automl_algorithm/iterative_algorithm.py | 15 ++- evalml/automl/automl_search.py | 30 +++-- evalml/automl/utils.py | 5 +- evalml/objectives/standard_metrics.py | 1 + evalml/pipelines/pipeline_base.py | 1 + .../time_series_regression_pipeline.py | 2 +- evalml/tests/automl_tests/test_automl.py | 104 +++++++++--------- .../test_automl_search_regression.py | 60 +++++----- .../tests/automl_tests/test_automl_utils.py | 26 ++++- evalml/tests/conftest.py | 8 +- 10 files changed, 140 insertions(+), 112 deletions(-) diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 57a205ec1a..ff507f97da 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -37,7 +37,6 @@ def __init__( text_in_ensembling=False, pipeline_params=None, custom_hyperparameters=None, - _frozen_pipeline_parameters=None, _estimator_family_order=None, ): """An automl algorithm which first fits a base round of pipelines with default parameters, then does a round of parameter tuning on each pipeline in order of performance. @@ -54,7 +53,6 @@ def __init__( text_in_ensembling (boolean): If True and ensembling is True, then n_jobs will be set to 1 to avoid downstream sklearn stacking issues related to nltk. pipeline_params (dict or None): Pipeline-level parameters that should be passed to the proposed pipelines. custom_hyperparameters (dict or None): Custom hyperparameter ranges specified for pipelines to iterate over. - _frozen_pipeline_parameters (dict or None): Pipeline-level parameters are frozen and used in the proposed pipelines. _estimator_family_order (list(ModelFamily) or None): specify the sort order for the first batch. Defaults to _ESTIMATOR_FAMILY_ORDER. 
""" self._estimator_family_order = ( @@ -95,7 +93,6 @@ def __init__( self.text_in_ensembling = text_in_ensembling self._pipeline_params = pipeline_params or {} self._custom_hyperparameters = custom_hyperparameters or {} - self._frozen_pipeline_parameters = _frozen_pipeline_parameters or {} def next_batch(self): """Get the next batch of pipelines to evaluate @@ -159,6 +156,9 @@ def next_batch(self): f"iterativealgorothm - next_batch - proposed_parameters: {proposed_parameters}" ) parameters = self._combine_parameters(pipeline, proposed_parameters) + print( + f"iterativealgorothm - next_batch - parameters: {parameters}" + ) next_batch.append( pipeline.new(parameters=parameters, random_seed=self.random_seed) ) @@ -167,10 +167,12 @@ def next_batch(self): return next_batch def _combine_parameters(self, pipeline, proposed_parameters): - """Helper function for logic to transform proposed parameters and frozen parameters.""" + """Helper function for logic to transform proposed parameters.""" + print(f"iterative algorithm - _combine_parameters - proposed_parameters: {proposed_parameters}") + print(f"iterative algorithm - _combine_parameters - pipeline: {pipeline.parameters}") + print("----------------------------") return { - **self._transform_parameters(pipeline, proposed_parameters), - **self._frozen_pipeline_parameters, + **self._transform_parameters(pipeline, proposed_parameters) } def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): @@ -278,4 +280,5 @@ def _transform_parameters(self, pipeline, proposed_parameters): if param_name in init_params: component_parameters[param_name] = value parameters[name] = component_parameters + print(f"iterative algorithm - transform parameters - parameters: {parameters}") return parameters diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index fb283488b6..0d3a95810e 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -387,11 +387,16 @@ def __init__( logger.warning( "Unable to import plotly; skipping pipeline search plotting\n" ) - print(allowed_component_graphs) - if allowed_component_graphs is not None and not isinstance(allowed_component_graphs, list): - raise ValueError( - "Parameter allowed_component_graphs must be either None or a list!" - ) + if allowed_component_graphs is not None: + if not isinstance(allowed_component_graphs, list): + raise ValueError( + "Parameter allowed_component_graphs must be either None or a list!" + ) + for graph in allowed_component_graphs: + if not isinstance(graph, dict): + raise ValueError( + "Every component graph passed must be of type dictionary!" 
+ ) self.allowed_component_graphs = allowed_component_graphs self.allowed_model_families = allowed_model_families self._automl_algorithm = None @@ -435,9 +440,6 @@ def __init__( if self.problem_configuration: parameters.update({"pipeline": self.problem_configuration}) - self._frozen_pipeline_parameters.update( - {"pipeline": self.problem_configuration} - ) self.sampler_method = sampler_method self.sampler_balanced_ratio = sampler_balanced_ratio @@ -460,9 +462,6 @@ def __init__( parameters[self._sampler_name].update( {"sampling_ratio": self.sampler_balanced_ratio} ) - self._frozen_pipeline_parameters[self._sampler_name] = parameters[ - self._sampler_name - ] if self.allowed_component_graphs is None: logger.info("Generating pipelines to search over...") @@ -488,16 +487,14 @@ def __init__( self.y_train, estimator, self.problem_type, - parameters=self._frozen_pipeline_parameters, + parameters=parameters, sampler_name=self._sampler_name, ) for estimator in allowed_estimators ] else: - self.allowed_pipelines = get_pipelines_from_component_graphs(self.allowed_component_graphs, self.problem_type) - print("AutoMLSearch - init - self.allowed_pipelines") - print('---------------------------------------------') - print(self.allowed_pipelines) + self.allowed_pipelines = get_pipelines_from_component_graphs(self.allowed_component_graphs, self.problem_type, parameters + ) if self.allowed_pipelines == []: raise ValueError("No allowed pipelines to search") @@ -598,7 +595,6 @@ def __init__( text_in_ensembling=text_in_ensembling, pipeline_params=parameters, custom_hyperparameters=custom_hyperparameters, - _frozen_pipeline_parameters=self._frozen_pipeline_parameters, ) def _get_batch_number(self): diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index 568e25ecc6..402ac1732c 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -223,7 +223,7 @@ def get_hyperparameter_ranges(component_graph, custom_hyperparameters): return hyperparameter_ranges -def get_pipelines_from_component_graphs(component_graphs_list, problem_type): +def get_pipelines_from_component_graphs(component_graphs_list, problem_type, parameters): """ Returns created pipelines from passed component graphs based on the specified problem type. 
@@ -249,5 +249,6 @@ def get_pipelines_from_component_graphs(component_graphs_list, problem_type): comp_seed = component_graph.pop("random_seed") comp_name = next(iter(component_graph)) comp_graph = component_graph[comp_name] - created_pipelines.append(pipeline_class(component_graph=comp_graph, custom_name=comp_name, random_seed=comp_seed)) + print(f"utils - get_pipelines_from_component_graphs - parameters: {parameters}") + created_pipelines.append(pipeline_class(component_graph=comp_graph, parameters=parameters, custom_name=comp_name, random_seed=comp_seed)) return created_pipelines diff --git a/evalml/objectives/standard_metrics.py b/evalml/objectives/standard_metrics.py index 6fbfbded7c..d10dabd320 100644 --- a/evalml/objectives/standard_metrics.py +++ b/evalml/objectives/standard_metrics.py @@ -414,6 +414,7 @@ class R2(RegressionObjective): is_bounded_like_percentage = False # Range (-Inf, 1] def objective_function(self, y_true, y_predicted, X=None): + print(f"standard metrics - R2 - score: {metrics.r2_score(y_true, y_predicted)}") return metrics.r2_score(y_true, y_predicted) diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index a241bc078e..6b62120b86 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -70,6 +70,7 @@ def __init__( component_graph, random_seed=self.random_seed ) else: + print(f"pipeline_base - __init__ - component_graph: {component_graph}") self._component_graph = ComponentGraph( component_dict=component_graph, random_seed=self.random_seed ) diff --git a/evalml/pipelines/time_series_regression_pipeline.py b/evalml/pipelines/time_series_regression_pipeline.py index c7bf4bce7e..9adff50698 100644 --- a/evalml/pipelines/time_series_regression_pipeline.py +++ b/evalml/pipelines/time_series_regression_pipeline.py @@ -38,7 +38,7 @@ def __init__( Pipeline(parameters={"pipeline": {"date_index": "Date", "max_delay": 4, "gap": 2}}). random_seed (int): Seed for the random number generator. Defaults to 0. """ - if "pipeline" not in parameters: + if not parameters or "pipeline" not in parameters: raise ValueError( "date_index, gap, and max_delay parameters cannot be omitted from the parameters dict. " "Please specify them as a dictionary with the key 'pipeline'." 
diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 0574f46cac..b1b51b7568 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -1841,6 +1841,9 @@ def test_percent_better_than_baseline_in_rankings( pipeline_scores, baseline_score, problem_type_value, + dummy_classifier_estimator_class, + dummy_regressor_estimator_class, + dummy_time_series_regressor_estimator_class, dummy_binary_pipeline_class, dummy_multiclass_pipeline_class, dummy_regression_pipeline_class, @@ -1853,11 +1856,11 @@ def test_percent_better_than_baseline_in_rankings( # Ok to only use binary labels since score and fit methods are mocked X, y = X_y_binary - pipeline_class = { - ProblemTypes.BINARY: dummy_binary_pipeline_class, - ProblemTypes.MULTICLASS: dummy_multiclass_pipeline_class, - ProblemTypes.REGRESSION: dummy_regression_pipeline_class, - ProblemTypes.TIME_SERIES_REGRESSION: dummy_time_series_regression_pipeline_class, + estimator_class = { + ProblemTypes.BINARY: dummy_classifier_estimator_class, + ProblemTypes.MULTICLASS: dummy_classifier_estimator_class, + ProblemTypes.REGRESSION: dummy_regressor_estimator_class, + ProblemTypes.TIME_SERIES_REGRESSION: dummy_time_series_regressor_estimator_class, }[problem_type_value] baseline_pipeline_class = { ProblemTypes.BINARY: "evalml.pipelines.BinaryClassificationPipeline", @@ -1866,31 +1869,15 @@ def test_percent_better_than_baseline_in_rankings( ProblemTypes.TIME_SERIES_REGRESSION: "evalml.pipelines.TimeSeriesRegressionPipeline", }[problem_type_value] - class DummyPipeline(pipeline_class): - problem_type = problem_type_value - - def __init__(self, parameters, random_seed=0): - super().__init__(parameters=parameters) - - def new(self, parameters, random_seed=0): - return self.__class__(parameters, random_seed=random_seed) - - def clone(self): - return self.__class__(self.parameters, random_seed=self.random_seed) - - def fit(self, *args, **kwargs): - """Mocking fit""" - - class Pipeline1(DummyPipeline): - custom_name = "Pipeline1" - - class Pipeline2(DummyPipeline): - custom_name = "Pipeline2" - + created_pipelines = get_pipelines_from_component_graphs(component_graphs_list=[{"Pipeline1": [estimator_class]}, + {"Pipeline2": [estimator_class]}], + problem_type=problem_type_value) mock_score_1 = MagicMock(return_value={objective.name: pipeline_scores[0]}) mock_score_2 = MagicMock(return_value={objective.name: pipeline_scores[1]}) - Pipeline1.score = mock_score_1 - Pipeline2.score = mock_score_2 + created_pipelines[0].score = mock_score_1 + created_pipelines[1].score = mock_score_2 + print('##################################') + print(created_pipelines[0].score) if objective.name.lower() == "cost benefit matrix": automl = AutoMLSearch( @@ -1898,8 +1885,6 @@ class Pipeline2(DummyPipeline): y_train=y, problem_type=problem_type_value, max_iterations=3, - allowed_component_graphs=[{"Pipeline1": {}}, - {"Pipeline2": {}}], objective=objective(0, 0, 0, 0), additional_objectives=[], optimize_thresholds=False, @@ -1927,13 +1912,14 @@ class Pipeline2(DummyPipeline): y_train=y, problem_type=problem_type_value, max_iterations=3, - allowed_component_graphs=[{"Pipeline1": {}}, - {"Pipeline2": {}}], + allowed_component_graphs=[{"Pipeline1": [estimator_class]}, + {"Pipeline2": [estimator_class]}], objective=objective, additional_objectives=[], optimize_thresholds=False, n_jobs=1, ) + #automl.allowed_pipelines = created_pipelines with patch( baseline_pipeline_class + ".score", @@ -1976,6 +1962,8 
@@ class Pipeline2(DummyPipeline): 2, ), } + print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%') + print(scores) for name in answers: np.testing.assert_almost_equal(scores[name], answers[name], decimal=3) @@ -2073,7 +2061,7 @@ def fit(self, *args, **kwargs): y_train=y, problem_type=problem_type, max_iterations=2, - allowed_pipelines=[DummyPipeline(parameters)], + allowed_component_graphs=[{"Name_0": ["Imputer", "ARIMA Regressor"]}], objective="auto", problem_configuration={"date_index": None, "gap": 1, "max_delay": 1}, additional_objectives=additional_objectives, @@ -2100,6 +2088,25 @@ def fit(self, *args, **kwargs): ) +def test_time_series_regression_with_parameters(ts_data): + X, y = ts_data + X.index.name = "Date" + problem_configuration = {"date_index": "ssate", "gap": 1, "max_delay": 0} + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="time series regression", + allowed_component_graphs=[{"Name_0": ["Imputer", "ARIMA Regressor"]}], + objective="auto", + problem_configuration=problem_configuration, + max_batches=3 + ) + automl.search() + print("--------------------------") + print(automl.allowed_pipelines[0].parameters) + assert automl.allowed_pipelines[0].parameters["pipeline"] == problem_configuration + + @pytest.mark.parametrize("fold_scores", [[2, 4, 6], [np.nan, 4, 6]]) @patch( "evalml.pipelines.BinaryClassificationPipeline.score", @@ -2361,7 +2368,7 @@ def test_early_stopping(caplog, logistic_regression_binary_pipeline_class, X_y_b return_value={"Log Loss Binary": 0.8}, ) @patch("evalml.pipelines.BinaryClassificationPipeline.fit") -def test_automl_one_allowed_pipeline_ensembling_disabled( +def test_automl_one_allowed_component_graph_ensembling_disabled( mock_pipeline_fit, mock_score, X_y_binary, @@ -2369,7 +2376,7 @@ def test_automl_one_allowed_pipeline_ensembling_disabled( caplog, ): max_iterations = _get_first_stacked_classifier_no([ModelFamily.RANDOM_FOREST]) + 1 - # Checks that when len(allowed_pipeline) == 1, ensembling is not run, even if set to True + # Checks that when len(allowed_component_graphs) == 1, ensembling is not run, even if set to True X, y = X_y_binary automl = AutoMLSearch( X_train=X, @@ -2409,7 +2416,7 @@ def test_automl_one_allowed_pipeline_ensembling_disabled( "Ensembling is set to True, but the number of unique pipelines is one, so ensembling will not run." 
in caplog.text ) - # Check that ensembling runs when len(allowed_model_families) == 1 but len(allowed_pipelines) > 1 + # Check that ensembling runs when len(allowed_model_families) == 1 but len(allowed_component_graphs) > 1 caplog.clear() automl = AutoMLSearch( X_train=X, @@ -4260,7 +4267,6 @@ def test_automl_drop_index_columns(mock_train, mock_binary_score, X_y_binary): automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", max_batches=2) automl.search() for pipeline in automl.allowed_pipelines: - print(pipeline.parameters) assert pipeline.get_component("Drop Columns Transformer") assert "Drop Columns Transformer" in pipeline.parameters assert pipeline.parameters["Drop Columns Transformer"] == { @@ -4276,37 +4282,29 @@ def test_automl_drop_index_columns(mock_train, mock_binary_score, X_y_binary): assert all(param == ["index_col"] for param in all_drop_column_params) -def test_automl_validates_data_passed_in_to_allowed_pipelines( - X_y_binary, dummy_binary_pipeline_class +def test_automl_validates_data_passed_in_to_allowed_component_graphs( + X_y_binary, dummy_classifier_estimator_class, dummy_binary_pipeline_class ): X, y = X_y_binary with pytest.raises( - ValueError, match="Parameter allowed_pipelines must be either None or a list!" - ): - AutoMLSearch( - X, y, problem_type="binary", allowed_pipelines=dummy_binary_pipeline_class - ) - - with pytest.raises( - ValueError, - match="Every element of allowed_pipelines must an instance of PipelineBase!", + ValueError, match="Parameter allowed_component_graphs must be either None or a list!" ): AutoMLSearch( - X, y, problem_type="binary", allowed_pipelines=[dummy_binary_pipeline_class] + X, y, problem_type="binary", allowed_component_graphs={"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} ) with pytest.raises( ValueError, - match="Every element of allowed_pipelines must an instance of PipelineBase!", + match="Every component graph passed must be of type dictionary!", ): AutoMLSearch( X, y, problem_type="binary", - allowed_pipelines=[ - dummy_binary_pipeline_class.custom_name, - dummy_binary_pipeline_class, + allowed_component_graphs=[ + "Mock Binary Classification Pipeline", + {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}, ], ) diff --git a/evalml/tests/automl_tests/test_automl_search_regression.py b/evalml/tests/automl_tests/test_automl_search_regression.py index f1df1acd3e..958cb9acf7 100644 --- a/evalml/tests/automl_tests/test_automl_search_regression.py +++ b/evalml/tests/automl_tests/test_automl_search_regression.py @@ -215,47 +215,55 @@ def test_log_metrics_only_passed_directly(X_y_regression): assert ar.additional_objectives[1].name == "Mean Squared Log Error" -def test_automl_allowed_pipelines_no_allowed_pipelines(X_y_regression): +def test_automl_allowed_pipelines_no_allowed_component_graphs(X_y_regression): X, y = X_y_regression with pytest.raises(ValueError, match="No allowed pipelines to search"): AutoMLSearch( X_train=X, y_train=y, problem_type="regression", - allowed_pipelines=None, + allowed_component_graphs=None, allowed_model_families=[], ) @patch("evalml.pipelines.RegressionPipeline.score") @patch("evalml.pipelines.RegressionPipeline.fit") -def test_automl_allowed_pipelines_specified_allowed_pipelines( - mock_fit, mock_score, dummy_regression_pipeline_class, X_y_regression +def test_automl_allowed_component_graphs_specified_allowed_pipelines( + mock_fit, mock_score, dummy_regressor_estimator_class, dummy_regression_pipeline_class, X_y_regression ): X, y = 
X_y_regression + automl = AutoMLSearch( X_train=X, y_train=y, problem_type="regression", - allowed_pipelines=[dummy_regression_pipeline_class({})], + allowed_component_graphs=[{"Mock Regression Pipeline": [dummy_regressor_estimator_class] }], allowed_model_families=None, ) mock_score.return_value = {automl.objective.name: 1.0} - expected_pipelines = [dummy_regression_pipeline_class({})] + expected_pipeline = dummy_regression_pipeline_class({}) + expected_component_graph = expected_pipeline.component_graph + expected_name = expected_pipeline.name + expected_oarameters = expected_pipeline.parameters mock_score.return_value = {automl.objective.name: 1.0} - assert automl.allowed_pipelines == expected_pipelines + assert automl.allowed_pipelines[0].component_graph == expected_component_graph + assert automl.allowed_pipelines[0].name == expected_name + assert automl.allowed_pipelines[0].parameters == expected_oarameters assert automl.allowed_model_families == [ModelFamily.NONE] automl.search() mock_fit.assert_called() mock_score.assert_called() - assert automl.allowed_pipelines == expected_pipelines + assert automl.allowed_pipelines[0].component_graph == expected_component_graph + assert automl.allowed_pipelines[0].name == expected_name + assert automl.allowed_pipelines[0].parameters == expected_oarameters assert automl.allowed_model_families == [ModelFamily.NONE] @patch("evalml.pipelines.RegressionPipeline.score") @patch("evalml.pipelines.RegressionPipeline.fit") -def test_automl_allowed_pipelines_specified_allowed_model_families( +def test_automl_allowed_component_graphs_specified_allowed_model_families( mock_fit, mock_score, X_y_regression, assert_allowed_pipelines_equal_helper ): X, y = X_y_regression @@ -263,7 +271,7 @@ def test_automl_allowed_pipelines_specified_allowed_model_families( X_train=X, y_train=y, problem_type="regression", - allowed_pipelines=None, + allowed_component_graphs=None, allowed_model_families=[ModelFamily.RANDOM_FOREST], ) mock_score.return_value = {automl.objective.name: 1.0} @@ -285,7 +293,7 @@ def test_automl_allowed_pipelines_specified_allowed_model_families( X_train=X, y_train=y, problem_type="regression", - allowed_pipelines=None, + allowed_component_graphs=None, allowed_model_families=["random_forest"], ) expected_pipelines = [ @@ -303,7 +311,7 @@ def test_automl_allowed_pipelines_specified_allowed_model_families( @patch("evalml.pipelines.RegressionPipeline.score") @patch("evalml.pipelines.RegressionPipeline.fit") -def test_automl_allowed_pipelines_init_allowed_both_not_specified( +def test_automl_allowed_component_graphs_init_allowed_both_not_specified( mock_fit, mock_score, X_y_regression, assert_allowed_pipelines_equal_helper ): X, y = X_y_regression @@ -311,7 +319,7 @@ def test_automl_allowed_pipelines_init_allowed_both_not_specified( X_train=X, y_train=y, problem_type="regression", - allowed_pipelines=None, + allowed_component_graphs=None, allowed_model_families=None, ) mock_score.return_value = {automl.objective.name: 1.0} @@ -330,9 +338,10 @@ def test_automl_allowed_pipelines_init_allowed_both_not_specified( @patch("evalml.pipelines.RegressionPipeline.score") @patch("evalml.pipelines.RegressionPipeline.fit") -def test_automl_allowed_pipelines_init_allowed_both_specified( +def test_automl_allowed_component_graphs_init_allowed_both_specified( mock_fit, mock_score, + dummy_regressor_estimator_class, dummy_regression_pipeline_class, X_y_regression, assert_allowed_pipelines_equal_helper, @@ -342,7 +351,7 @@ def 
test_automl_allowed_pipelines_init_allowed_both_specified( X_train=X, y_train=y, problem_type="regression", - allowed_pipelines=[dummy_regression_pipeline_class({})], + allowed_component_graphs=[{"Mock Regression Pipeline": [dummy_regressor_estimator_class]}], allowed_model_families=[ModelFamily.RANDOM_FOREST], ) mock_score.return_value = {automl.objective.name: 1.0} @@ -359,22 +368,21 @@ def test_automl_allowed_pipelines_init_allowed_both_specified( @pytest.mark.parametrize("is_linear", [True, False]) @patch("evalml.pipelines.RegressionPipeline.score") @patch("evalml.pipelines.RegressionPipeline.fit") -def test_automl_allowed_pipelines_search( +def test_automl_allowed_component_graphs_search( mock_fit, mock_score, is_linear, - dummy_regression_pipeline_class, - nonlinear_regression_pipeline_class, + dummy_regressor_linear_component_graph, + dummy_regressor_dict_component_graph, X_y_regression, ): X, y = X_y_regression mock_score.return_value = {"R2": 1.0} - pipeline_class = ( - dummy_regression_pipeline_class + component_graph = ( + dummy_regressor_linear_component_graph if is_linear - else nonlinear_regression_pipeline_class + else dummy_regressor_dict_component_graph ) - allowed_pipelines = [pipeline_class({})] start_iteration_callback = MagicMock() automl = AutoMLSearch( @@ -383,15 +391,13 @@ def test_automl_allowed_pipelines_search( problem_type="regression", max_iterations=2, start_iteration_callback=start_iteration_callback, - allowed_pipelines=allowed_pipelines, + allowed_component_graphs=[component_graph], ) automl.search() assert start_iteration_callback.call_count == 2 - assert isinstance( - start_iteration_callback.call_args_list[0][0][0], RegressionPipeline - ) - assert isinstance(start_iteration_callback.call_args_list[1][0][0], pipeline_class) + assert isinstance(start_iteration_callback.call_args_list[0][0][0], RegressionPipeline) + assert isinstance(start_iteration_callback.call_args_list[1][0][0], RegressionPipeline) @patch("evalml.pipelines.TimeSeriesRegressionPipeline.score", return_value={"R2": 0.3}) diff --git a/evalml/tests/automl_tests/test_automl_utils.py b/evalml/tests/automl_tests/test_automl_utils.py index 87084d3bd5..2924fb9833 100644 --- a/evalml/tests/automl_tests/test_automl_utils.py +++ b/evalml/tests/automl_tests/test_automl_utils.py @@ -17,7 +17,8 @@ tune_binary_threshold, ) from evalml.objectives import F1, R2, LogLossBinary, LogLossMulticlass -from evalml.pipelines import BinaryClassificationPipeline +from evalml.pipelines import BinaryClassificationPipeline, MulticlassClassificationPipeline, RegressionPipeline, \ + TimeSeriesRegressionPipeline from evalml.preprocessing.data_splitters import ( TimeSeriesSplit, TrainingValidationSplit, @@ -312,4 +313,25 @@ def test_get_hyperparameter_ranges(): assert algo_ranges == hyper_ranges -def test_get_pipelines_from_component_graphs(): +@pytest.mark.parametrize("problem_type,estimator", [("binary", "Random Forest Classifier"), + ("multiclass", "Random Forest Classifier"), + ("regression", "Random Forest Regressor"), + ("time series regression", "ARIMA Regressor")]) +def test_get_pipelines_from_component_graphs(problem_type, estimator): + component_graphs = [{"Name_0": ["Imputer", estimator]}, + {"Name_1": { + "Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + estimator: [estimator, "Imputer_1"], + }}] + if problem_type == "time series regression": + with pytest.raises(ValueError, match="date_index, gap, and max_delay"): + get_pipelines_from_component_graphs(component_graphs, problem_type) + else: + 
returned_pipelines = get_pipelines_from_component_graphs(component_graphs, problem_type) + if problem_type == "binary": + assert all(isinstance(pipe_, BinaryClassificationPipeline) for pipe_ in returned_pipelines) + elif problem_type == "multiclass": + assert all(isinstance(pipe_, MulticlassClassificationPipeline) for pipe_ in returned_pipelines) + elif problem_type == "regression": + assert all(isinstance(pipe_, RegressionPipeline) for pipe_ in returned_pipelines) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 227b435dd6..b40fa27e61 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -361,21 +361,21 @@ def dummy_regressor_linear_component_graph(dummy_regressor_estimator_class): @pytest.fixture def dummy_binary_dict_component_graph(dummy_classifier_estimator_class): - component_graph_dict = { + component_graph_dict = {"Name": { "Imputer": ["Imputer"], "Imputer_1": ["Imputer", "Imputer"], "Random Forest Classifier": [dummy_classifier_estimator_class, "Imputer_1"], - } + }} return component_graph_dict @pytest.fixture def dummy_regressor_dict_component_graph(dummy_regressor_estimator_class): - component_graph_dict = { + component_graph_dict = {"Name": { "Imputer": ["Imputer"], "Imputer_1": ["Imputer", "Imputer"], "Random Forest Classifier": [dummy_regressor_estimator_class, "Imputer_1"], - } + }} return component_graph_dict From b8d4a633945008359be15b20d927e285225238eb Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Wed, 9 Jun 2021 20:38:24 -0400 Subject: [PATCH 60/85] test updates --- evalml/automl/automl_search.py | 8 +- evalml/automl/utils.py | 4 +- .../test_automl_search_classification.py | 146 +++++++++++------- .../test_automl_search_regression.py | 4 +- .../tests/automl_tests/test_automl_utils.py | 5 +- evalml/tests/conftest.py | 82 +++++++++- 6 files changed, 177 insertions(+), 72 deletions(-) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index f46f1dd0b4..24b5a0bee8 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -423,7 +423,6 @@ def __init__( self.custom_hyperparameters = custom_hyperparameters or {} self.search_iteration_plot = None self._interrupted = False - self._frozen_pipeline_parameters = {} parameters = copy.copy(self.pipeline_parameters) @@ -467,7 +466,7 @@ def __init__( ) index_columns = list(self.X_train.ww.select("index").columns) if len(index_columns) > 0 and drop_columns is None: - self._frozen_pipeline_parameters["Drop Columns Transformer"] = { + parameters["Drop Columns Transformer"] = { "columns": index_columns } self.allowed_pipelines = [ @@ -482,8 +481,9 @@ def __init__( for estimator in allowed_estimators ] else: - self.allowed_pipelines = get_pipelines_from_component_graphs(self.allowed_component_graphs, self.problem_type, parameters - ) + self.allowed_pipelines = get_pipelines_from_component_graphs(self.allowed_component_graphs, + self.problem_type, + parameters) if self.allowed_pipelines == []: raise ValueError("No allowed pipelines to search") diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index 3c3d5ab099..094a441bf1 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -223,13 +223,14 @@ def get_hyperparameter_ranges(component_graph, custom_hyperparameters): return hyperparameter_ranges -def get_pipelines_from_component_graphs(component_graphs_list, problem_type, parameters): +def get_pipelines_from_component_graphs(component_graphs_list, problem_type, parameters=None): """ Returns created pipelines from passed 
component graphs based on the specified problem type. Arguments: component_graphs_list (list): The list of component graphs. problem_type (str or ProblemType): The problem type for which pipelines will be created. + parameters (dict or None): Pipeline-level parameters that should be passed to the proposed pipelines. Returns: list: List of pipelines made from the passed component graphs. @@ -249,6 +250,5 @@ def get_pipelines_from_component_graphs(component_graphs_list, problem_type, par comp_seed = component_graph.pop("random_seed") comp_name = next(iter(component_graph)) comp_graph = component_graph[comp_name] - print(f"utils - get_pipelines_from_component_graphs - parameters: {parameters}") created_pipelines.append(pipeline_class(component_graph=comp_graph, parameters=parameters, custom_name=comp_name, random_seed=comp_seed)) return created_pipelines diff --git a/evalml/tests/automl_tests/test_automl_search_classification.py b/evalml/tests/automl_tests/test_automl_search_classification.py index 3f6a29b1e5..72bc08c837 100644 --- a/evalml/tests/automl_tests/test_automl_search_classification.py +++ b/evalml/tests/automl_tests/test_automl_search_classification.py @@ -689,7 +689,7 @@ def test_max_time(X_y_binary): @pytest.mark.parametrize("automl_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]) -def test_automl_allowed_pipelines_no_allowed_pipelines( +def test_automl_allowed_component_graphs_no_component_graphs( automl_type, X_y_binary, X_y_multi ): is_multiclass = automl_type == ProblemTypes.MULTICLASS @@ -700,64 +700,78 @@ def test_automl_allowed_pipelines_no_allowed_pipelines( X_train=X, y_train=y, problem_type=problem_type, - allowed_pipelines=None, + allowed_component_graphs=None, allowed_model_families=[], ) @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") -def test_automl_allowed_pipelines_specified_allowed_pipelines_binary( - mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary +def test_automl_component_graphs_specified_component_graphs_binary( + mock_fit, mock_score, dummy_classifier_estimator_class, dummy_binary_pipeline_class, X_y_binary ): X, y = X_y_binary automl = AutoMLSearch( X_train=X, y_train=y, problem_type="binary", - allowed_pipelines=[dummy_binary_pipeline_class({})], + allowed_component_graphs=[{"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class] }], allowed_model_families=None, ) - expected_pipelines = [dummy_binary_pipeline_class({})] + expected_pipeline = dummy_binary_pipeline_class({}) + expected_component_graph = expected_pipeline.component_graph + expected_name = expected_pipeline.name + expected_parameters = expected_pipeline.parameters mock_score.return_value = {automl.objective.name: 1.0} - assert automl.allowed_pipelines == expected_pipelines + assert automl.allowed_pipelines[0].component_graph == expected_component_graph + assert automl.allowed_pipelines[0].name == expected_name + assert automl.allowed_pipelines[0].parameters == expected_parameters assert automl.allowed_model_families == [ModelFamily.NONE] automl.search() mock_fit.assert_called() mock_score.assert_called() - assert automl.allowed_pipelines == expected_pipelines + assert automl.allowed_pipelines[0].component_graph == expected_component_graph + assert automl.allowed_pipelines[0].name == expected_name + assert automl.allowed_pipelines[0].parameters == expected_parameters assert automl.allowed_model_families == [ModelFamily.NONE] 
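A minimal sketch (not part of the patch) of the call shape these tests exercise, assuming evalml's built-in component names and placeholder data: `allowed_component_graphs` replaces `allowed_pipelines` and takes a list of single-key dicts, each mapping a custom pipeline name to either a linear list of components or a dict-style component graph. The pipeline names and the random DataFrame/Series below are illustrative only.

import numpy as np
import pandas as pd

from evalml.automl import AutoMLSearch

# Placeholder binary-classification data, for illustration only.
X = pd.DataFrame(np.random.rand(100, 4), columns=["a", "b", "c", "d"])
y = pd.Series(np.random.randint(0, 2, 100))

automl = AutoMLSearch(
    X_train=X,
    y_train=y,
    problem_type="binary",
    max_iterations=3,
    allowed_component_graphs=[
        # Linear form: an ordered list of components ending in an estimator.
        {"Linear RF": ["Imputer", "One Hot Encoder", "Random Forest Classifier"]},
        # Dict form: each value lists the component followed by its input components.
        {
            "Branching RF": {
                "Imputer": ["Imputer"],
                "Imputer_1": ["Imputer", "Imputer"],
                "Random Forest Classifier": ["Random Forest Classifier", "Imputer_1"],
            }
        },
    ],
)
automl.search()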
@patch("evalml.pipelines.MulticlassClassificationPipeline.score") @patch("evalml.pipelines.MulticlassClassificationPipeline.fit") -def test_automl_allowed_pipelines_specified_allowed_pipelines_multi( - mock_fit, mock_score, dummy_multiclass_pipeline_class, X_y_multi +def test_automl_component_graphs_specified_component_graphs_multi( + mock_fit, mock_score, dummy_classifier_estimator_class, dummy_multiclass_pipeline_class, X_y_multi ): X, y = X_y_multi automl = AutoMLSearch( X_train=X, y_train=y, problem_type="multiclass", - allowed_pipelines=[dummy_multiclass_pipeline_class({})], + allowed_component_graphs=[{"Mock Multiclass Classification Pipeline": [dummy_classifier_estimator_class] }], allowed_model_families=None, ) - expected_pipelines = [dummy_multiclass_pipeline_class({})] + expected_pipeline = dummy_multiclass_pipeline_class({}) + expected_component_graph = expected_pipeline.component_graph + expected_name = expected_pipeline.name + expected_parameters = expected_pipeline.parameters mock_score.return_value = {automl.objective.name: 1.0} - assert automl.allowed_pipelines == expected_pipelines + assert automl.allowed_pipelines[0].component_graph == expected_component_graph + assert automl.allowed_pipelines[0].name == expected_name + assert automl.allowed_pipelines[0].parameters == expected_parameters assert automl.allowed_model_families == [ModelFamily.NONE] automl.search() mock_fit.assert_called() mock_score.assert_called() - assert automl.allowed_pipelines == expected_pipelines + assert automl.allowed_pipelines[0].component_graph == expected_component_graph + assert automl.allowed_pipelines[0].name == expected_name + assert automl.allowed_pipelines[0].parameters == expected_parameters assert automl.allowed_model_families == [ModelFamily.NONE] @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") -def test_automl_allowed_pipelines_specified_allowed_model_families_binary( +def test_automl_component_graphs_specified_allowed_model_families_binary( mock_fit, mock_score, X_y_binary, assert_allowed_pipelines_equal_helper ): X, y = X_y_binary @@ -765,7 +779,7 @@ def test_automl_allowed_pipelines_specified_allowed_model_families_binary( X_train=X, y_train=y, problem_type="binary", - allowed_pipelines=None, + allowed_component_graphs=None, allowed_model_families=[ModelFamily.RANDOM_FOREST], ) mock_score.return_value = {automl.objective.name: 1.0} @@ -789,7 +803,7 @@ def test_automl_allowed_pipelines_specified_allowed_model_families_binary( X_train=X, y_train=y, problem_type="binary", - allowed_pipelines=None, + allowed_component_graphs=None, allowed_model_families=["random_forest"], ) expected_pipelines = [ @@ -808,7 +822,7 @@ def test_automl_allowed_pipelines_specified_allowed_model_families_binary( @patch("evalml.pipelines.MulticlassClassificationPipeline.score") @patch("evalml.pipelines.MulticlassClassificationPipeline.fit") -def test_automl_allowed_pipelines_specified_allowed_model_families_multi( +def test_automl_component_graphs_specified_allowed_model_families_multi( mock_fit, mock_score, X_y_multi, assert_allowed_pipelines_equal_helper ): X, y = X_y_multi @@ -816,7 +830,7 @@ def test_automl_allowed_pipelines_specified_allowed_model_families_multi( X_train=X, y_train=y, problem_type="multiclass", - allowed_pipelines=None, + allowed_component_graphs=None, allowed_model_families=[ModelFamily.RANDOM_FOREST], ) mock_score.return_value = {automl.objective.name: 1.0} @@ -840,7 +854,7 @@ def 
test_automl_allowed_pipelines_specified_allowed_model_families_multi( X_train=X, y_train=y, problem_type="multiclass", - allowed_pipelines=None, + allowed_component_graphs=None, allowed_model_families=["random_forest"], ) expected_pipelines = [ @@ -859,7 +873,7 @@ def test_automl_allowed_pipelines_specified_allowed_model_families_multi( @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") -def test_automl_allowed_pipelines_init_allowed_both_not_specified_binary( +def test_automl_component_graphs_init_allowed_both_not_specified_binary( mock_fit, mock_score, X_y_binary, assert_allowed_pipelines_equal_helper ): X, y = X_y_binary @@ -867,7 +881,7 @@ def test_automl_allowed_pipelines_init_allowed_both_not_specified_binary( X_train=X, y_train=y, problem_type="binary", - allowed_pipelines=None, + allowed_component_graphs=None, allowed_model_families=None, ) mock_score.return_value = {automl.objective.name: 1.0} @@ -888,7 +902,7 @@ def test_automl_allowed_pipelines_init_allowed_both_not_specified_binary( @patch("evalml.pipelines.MulticlassClassificationPipeline.score") @patch("evalml.pipelines.MulticlassClassificationPipeline.fit") -def test_automl_allowed_pipelines_init_allowed_both_not_specified_multi( +def test_automl_component_graphs_init_allowed_both_not_specified_multi( mock_fit, mock_score, X_y_multi, assert_allowed_pipelines_equal_helper ): X, y = X_y_multi @@ -896,7 +910,7 @@ def test_automl_allowed_pipelines_init_allowed_both_not_specified_multi( X_train=X, y_train=y, problem_type="multiclass", - allowed_pipelines=None, + allowed_component_graphs=None, allowed_model_families=None, ) mock_score.return_value = {automl.objective.name: 1.0} @@ -917,9 +931,10 @@ def test_automl_allowed_pipelines_init_allowed_both_not_specified_multi( @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") -def test_automl_allowed_pipelines_init_allowed_both_specified_binary( +def test_automl_component_graphs_init_allowed_both_specified_binary( mock_fit, mock_score, + dummy_classifier_estimator_class, dummy_binary_pipeline_class, X_y_binary, assert_allowed_pipelines_equal_helper, @@ -929,19 +944,23 @@ def test_automl_allowed_pipelines_init_allowed_both_specified_binary( X_train=X, y_train=y, problem_type="binary", - allowed_pipelines=[dummy_binary_pipeline_class({})], + allowed_component_graphs=[{"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class] }], allowed_model_families=[ModelFamily.RANDOM_FOREST], ) mock_score.return_value = {automl.objective.name: 1.0} - expected_pipelines = [dummy_binary_pipeline_class({})] - assert automl.allowed_pipelines == expected_pipelines - # the dummy binary pipeline estimator has model family NONE - assert set(automl.allowed_model_families) == set([ModelFamily.NONE]) + expected_pipeline = dummy_binary_pipeline_class({}) + expected_component_graph = expected_pipeline.component_graph + expected_name = expected_pipeline.name + expected_parameters = expected_pipeline.parameters + mock_score.return_value = {automl.objective.name: 1.0} + assert automl.allowed_pipelines[0].component_graph == expected_component_graph + assert automl.allowed_pipelines[0].name == expected_name + assert automl.allowed_pipelines[0].parameters == expected_parameters + assert automl.allowed_model_families == [ModelFamily.NONE] automl.search() - assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) assert 
set(automl.allowed_model_families) == set( - [p.model_family for p in expected_pipelines] + [p.model_family for p in expected_pipeline] ) mock_fit.assert_called() mock_score.assert_called() @@ -949,9 +968,10 @@ def test_automl_allowed_pipelines_init_allowed_both_specified_binary( @patch("evalml.pipelines.MulticlassClassificationPipeline.score") @patch("evalml.pipelines.MulticlassClassificationPipeline.fit") -def test_automl_allowed_pipelines_init_allowed_both_specified_multi( +def test_automl_component_graphs_init_allowed_both_specified_multi( mock_fit, mock_score, + dummy_classifier_estimator_class, dummy_multiclass_pipeline_class, X_y_multi, assert_allowed_pipelines_equal_helper, @@ -961,66 +981,67 @@ def test_automl_allowed_pipelines_init_allowed_both_specified_multi( X_train=X, y_train=y, problem_type="multiclass", - allowed_pipelines=[dummy_multiclass_pipeline_class({})], + allowed_component_graphs=[{"Mock Multiclass Classification Pipeline": [dummy_classifier_estimator_class] }], allowed_model_families=[ModelFamily.RANDOM_FOREST], ) mock_score.return_value = {automl.objective.name: 1.0} - expected_pipelines = [dummy_multiclass_pipeline_class({})] - assert automl.allowed_pipelines == expected_pipelines - # the dummy multiclass pipeline estimator has model family NONE - assert set(automl.allowed_model_families) == set([ModelFamily.NONE]) + expected_pipeline = dummy_multiclass_pipeline_class({}) + expected_component_graph = expected_pipeline.component_graph + expected_name = expected_pipeline.name + expected_parameters = expected_pipeline.parameters + mock_score.return_value = {automl.objective.name: 1.0} + assert automl.allowed_pipelines[0].component_graph == expected_component_graph + assert automl.allowed_pipelines[0].name == expected_name + assert automl.allowed_pipelines[0].parameters == expected_parameters + assert automl.allowed_model_families == [ModelFamily.NONE] automl.search() - assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines) assert set(automl.allowed_model_families) == set( - [p.model_family for p in expected_pipelines] + [p.model_family for p in expected_pipeline] ) mock_fit.assert_called() mock_score.assert_called() @pytest.mark.parametrize("is_linear", [True, False]) -@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]) +@pytest.mark.parametrize("problem_type", ["binary", "multiclass"]) @patch("evalml.pipelines.MulticlassClassificationPipeline.score") @patch("evalml.pipelines.MulticlassClassificationPipeline.fit") @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") -def test_automl_allowed_pipelines_search( +def test_automl_component_graphs_search( mock_binary_fit, mock_binary_score, mock_multi_fit, mock_multi_score, is_linear, problem_type, - dummy_binary_pipeline_class, - nonlinear_binary_pipeline_class, - dummy_multiclass_pipeline_class, - nonlinear_multiclass_pipeline_class, + dummy_binary_linear_component_graph, + dummy_binary_dict_component_graph, + dummy_multiclass_linear_component_graph, + dummy_multiclass_dict_component_graph, X_y_binary, X_y_multi, ): - if problem_type == ProblemTypes.BINARY: + if problem_type == "binary": X, y = X_y_binary mock_binary_score.return_value = {"Log Loss Binary": 1.0} expected_mock_class = BinaryClassificationPipeline - pipeline_class = ( - dummy_binary_pipeline_class + component_graph = ( + dummy_binary_linear_component_graph if is_linear - else nonlinear_binary_pipeline_class + else 
dummy_binary_dict_component_graph ) else: X, y = X_y_multi mock_multi_score.return_value = {"Log Loss Multiclass": 1.0} expected_mock_class = MulticlassClassificationPipeline - - pipeline_class = ( - dummy_multiclass_pipeline_class + component_graph = ( + dummy_multiclass_linear_component_graph if is_linear - else nonlinear_multiclass_pipeline_class + else dummy_multiclass_dict_component_graph ) - allowed_pipelines = [pipeline_class({})] - start_iteration_callback = MagicMock() automl = AutoMLSearch( X_train=X, @@ -1028,7 +1049,7 @@ def test_automl_allowed_pipelines_search( problem_type=problem_type, max_iterations=5, start_iteration_callback=start_iteration_callback, - allowed_pipelines=allowed_pipelines, + allowed_component_graphs=[component_graph], ) automl.search() @@ -1036,9 +1057,14 @@ def test_automl_allowed_pipelines_search( start_iteration_callback.call_args_list[0][0][0], expected_mock_class ) for i in range(1, 5): - assert isinstance( - start_iteration_callback.call_args_list[i][0][0], pipeline_class - ) + if problem_type == "binary": + assert isinstance( + start_iteration_callback.call_args_list[i][0][0], BinaryClassificationPipeline + ) + elif problem_type == "multiclass": + assert isinstance( + start_iteration_callback.call_args_list[i][0][0], MulticlassClassificationPipeline + ) @pytest.mark.parametrize( diff --git a/evalml/tests/automl_tests/test_automl_search_regression.py b/evalml/tests/automl_tests/test_automl_search_regression.py index 958cb9acf7..e2c25570d5 100644 --- a/evalml/tests/automl_tests/test_automl_search_regression.py +++ b/evalml/tests/automl_tests/test_automl_search_regression.py @@ -215,7 +215,7 @@ def test_log_metrics_only_passed_directly(X_y_regression): assert ar.additional_objectives[1].name == "Mean Squared Log Error" -def test_automl_allowed_pipelines_no_allowed_component_graphs(X_y_regression): +def test_automl_component_graphs_no_allowed_component_graphs(X_y_regression): X, y = X_y_regression with pytest.raises(ValueError, match="No allowed pipelines to search"): AutoMLSearch( @@ -229,7 +229,7 @@ def test_automl_allowed_pipelines_no_allowed_component_graphs(X_y_regression): @patch("evalml.pipelines.RegressionPipeline.score") @patch("evalml.pipelines.RegressionPipeline.fit") -def test_automl_allowed_component_graphs_specified_allowed_pipelines( +def test_automl_allowed_component_graphs_specified_component_graphs( mock_fit, mock_score, dummy_regressor_estimator_class, dummy_regression_pipeline_class, X_y_regression ): X, y = X_y_regression diff --git a/evalml/tests/automl_tests/test_automl_utils.py b/evalml/tests/automl_tests/test_automl_utils.py index 98fc8e9a6f..65654fcc52 100644 --- a/evalml/tests/automl_tests/test_automl_utils.py +++ b/evalml/tests/automl_tests/test_automl_utils.py @@ -326,7 +326,8 @@ def test_get_hyperparameter_ranges(): ("regression", "Random Forest Regressor"), ("time series regression", "ARIMA Regressor")]) def test_get_pipelines_from_component_graphs(problem_type, estimator): - component_graphs = [{"Name_0": ["Imputer", estimator]}, + component_graphs = [{"Name_0": ["Imputer", estimator], + "random_seed": 42}, {"Name_1": { "Imputer": ["Imputer"], "Imputer_1": ["Imputer", "Imputer"], @@ -337,6 +338,8 @@ def test_get_pipelines_from_component_graphs(problem_type, estimator): get_pipelines_from_component_graphs(component_graphs, problem_type) else: returned_pipelines = get_pipelines_from_component_graphs(component_graphs, problem_type) + assert returned_pipelines[0].random_seed == 42 + assert returned_pipelines[1].random_seed == 
0 if problem_type == "binary": assert all(isinstance(pipe_, BinaryClassificationPipeline) for pipe_ in returned_pipelines) elif problem_type == "multiclass": diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index b40fa27e61..17f68ea7d1 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -348,11 +348,67 @@ def fit(self, X, y): @pytest.fixture -def dummy_binary_linear_component_graph(dummy_classifier_estimator_class): +def dummy_binary_estimator_class(): + class MockEstimator(Estimator): + name = "Mock Binary Classifier" + model_family = ModelFamily.NONE + supported_problem_types = [ + ProblemTypes.BINARY, + ProblemTypes.TIME_SERIES_BINARY, + ] + hyperparameter_ranges = {"a": Integer(0, 10), "b": Real(0, 10)} + + def __init__(self, a=1, b=0, random_seed=0): + super().__init__( + parameters={"a": a, "b": b}, component_obj=None, random_seed=random_seed + ) + + def fit(self, X, y): + return self + + return MockEstimator + + +@pytest.fixture +def dummy_multiclass_estimator_class(): + class MockEstimator(Estimator): + name = "Mock Multiclass Classifier" + model_family = ModelFamily.NONE + supported_problem_types = [ + ProblemTypes.MULTICLASS, + ProblemTypes.TIME_SERIES_MULTICLASS, + ] + hyperparameter_ranges = {"a": Integer(0, 10), "b": Real(0, 10)} + + def __init__(self, a=1, b=0, random_seed=0): + super().__init__( + parameters={"a": a, "b": b}, component_obj=None, random_seed=random_seed + ) + + def fit(self, X, y): + return self + + return MockEstimator + + +@pytest.fixture +def dummy_classifier_linear_component_graph(dummy_classifier_estimator_class): component_graph_linear = {"Name": ["Imputer", "One Hot Encoder", dummy_classifier_estimator_class]} return component_graph_linear +@pytest.fixture +def dummy_binary_linear_component_graph(dummy_binary_estimator_class): + component_graph_linear = {"Name": ["Imputer", "One Hot Encoder", dummy_binary_estimator_class]} + return component_graph_linear + + +@pytest.fixture +def dummy_multiclass_linear_component_graph(dummy_multiclass_estimator_class): + component_graph_linear = {"Name": ["Imputer", "One Hot Encoder", dummy_multiclass_estimator_class]} + return component_graph_linear + + @pytest.fixture def dummy_regressor_linear_component_graph(dummy_regressor_estimator_class): component_graph_linear = {"Name": ["Imputer", "One Hot Encoder", dummy_regressor_estimator_class]} @@ -360,7 +416,7 @@ def dummy_regressor_linear_component_graph(dummy_regressor_estimator_class): @pytest.fixture -def dummy_binary_dict_component_graph(dummy_classifier_estimator_class): +def dummy_classifier_dict_component_graph(dummy_classifier_estimator_class): component_graph_dict = {"Name": { "Imputer": ["Imputer"], "Imputer_1": ["Imputer", "Imputer"], @@ -369,6 +425,26 @@ def dummy_binary_dict_component_graph(dummy_classifier_estimator_class): return component_graph_dict +@pytest.fixture +def dummy_binary_dict_component_graph(dummy_binary_estimator_class): + component_graph_dict = {"Name": { + "Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + "Random Forest Classifier": [dummy_binary_estimator_class, "Imputer_1"], + }} + return component_graph_dict + + +@pytest.fixture +def dummy_multiclass_dict_component_graph(dummy_multiclass_estimator_class): + component_graph_dict = {"Name": { + "Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + "Random Forest Classifier": [dummy_multiclass_estimator_class, "Imputer_1"], + }} + return component_graph_dict + + @pytest.fixture def 
dummy_regressor_dict_component_graph(dummy_regressor_estimator_class): component_graph_dict = {"Name": { @@ -412,7 +488,7 @@ def dummy_multiclass_pipeline_class(dummy_classifier_estimator_class): class MockMulticlassClassificationPipeline(MulticlassClassificationPipeline): estimator = MockEstimator component_graph = [MockEstimator] - custom_name = None + custom_name = "Mock Multiclass Classification Pipeline" def __init__(self, parameters, random_seed=0): super().__init__( From e67a3ee42c431dd22396cf9e5e22146e0d479529 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 10 Jun 2021 01:21:09 -0400 Subject: [PATCH 61/85] updates --- evalml/automl/automl_search.py | 2 +- evalml/tests/automl_tests/test_automl.py | 115 +++++++++++----- evalml/tests/automl_tests/test_automl_dask.py | 41 +++++- .../automl_tests/test_iterative_algorithm.py | 127 +++++------------- 4 files changed, 158 insertions(+), 127 deletions(-) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 24b5a0bee8..b9282a0f64 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -488,7 +488,7 @@ def __init__( if self.allowed_pipelines == []: raise ValueError("No allowed pipelines to search") - logger.info(f"{len(self.allowed_pipelines)} component graphs ready for search.") + logger.info(f"{len(self.allowed_pipelines)} pipelines ready for search.") check_all_pipeline_names_unique(self.allowed_pipelines) run_ensembling = self.ensembling diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index bbc0e9e458..d4606a78bb 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -13,6 +13,7 @@ from skopt.space import Categorical, Integer, Real from evalml import AutoMLSearch +from evalml.automl.automl_algorithm import IterativeAlgorithm from evalml.automl.callbacks import ( log_error_callback, raise_error_callback, @@ -64,7 +65,7 @@ is_classification, is_time_series, ) -from evalml.tuners import NoParamsException, RandomSearchTuner +from evalml.tuners import NoParamsException, RandomSearchTuner, SKOptTuner @pytest.mark.parametrize( @@ -994,7 +995,7 @@ def test_default_objective(X_y_binary): @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") -def test_add_to_rankings(mock_fit, mock_score, dummy_binary_linear_component_graph, dummy_binary_pipeline_class, X_y_binary): +def test_add_to_rankings(mock_fit, mock_score, dummy_classifier_linear_component_graph, dummy_binary_pipeline_class, X_y_binary): X, y = X_y_binary mock_score.return_value = {"Log Loss Binary": 1.0} @@ -1003,7 +1004,7 @@ def test_add_to_rankings(mock_fit, mock_score, dummy_binary_linear_component_gra y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs=[dummy_binary_linear_component_graph] + allowed_component_graphs=[dummy_classifier_linear_component_graph] ) automl.search() assert len(automl.rankings) == 1 @@ -1039,7 +1040,7 @@ def test_add_to_rankings(mock_fit, mock_score, dummy_binary_linear_component_gra @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_add_to_rankings_no_search( - mock_fit, mock_score, dummy_binary_linear_component_graph, dummy_binary_pipeline_class, X_y_binary + mock_fit, mock_score, dummy_classifier_linear_component_graph, dummy_binary_pipeline_class, X_y_binary ): X, y = X_y_binary automl = AutoMLSearch( @@ -1047,7 +1048,7 @@ 
def test_add_to_rankings_no_search( y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs=[dummy_binary_linear_component_graph], + allowed_component_graphs=[dummy_classifier_linear_component_graph], ) mock_score.return_value = {"Log Loss Binary": 0.5234} @@ -1135,7 +1136,7 @@ def test_add_to_rankings_regression( @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_add_to_rankings_duplicate( - mock_fit, mock_score, dummy_binary_linear_component_graph, dummy_binary_pipeline_class, X_y_binary + mock_fit, mock_score, dummy_classifier_linear_component_graph, dummy_binary_pipeline_class, X_y_binary ): X, y = X_y_binary mock_score.return_value = {"Log Loss Binary": 0.1234} @@ -1145,7 +1146,7 @@ def test_add_to_rankings_duplicate( y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs=[dummy_binary_linear_component_graph], + allowed_component_graphs=[dummy_classifier_linear_component_graph], ) automl.search() best_pipeline = automl.best_pipeline @@ -1842,9 +1843,6 @@ def test_percent_better_than_baseline_in_rankings( pipeline_scores, baseline_score, problem_type_value, - dummy_classifier_estimator_class, - dummy_regressor_estimator_class, - dummy_time_series_regressor_estimator_class, dummy_binary_pipeline_class, dummy_multiclass_pipeline_class, dummy_regression_pipeline_class, @@ -1857,11 +1855,11 @@ def test_percent_better_than_baseline_in_rankings( # Ok to only use binary labels since score and fit methods are mocked X, y = X_y_binary - estimator_class = { - ProblemTypes.BINARY: dummy_classifier_estimator_class, - ProblemTypes.MULTICLASS: dummy_classifier_estimator_class, - ProblemTypes.REGRESSION: dummy_regressor_estimator_class, - ProblemTypes.TIME_SERIES_REGRESSION: dummy_time_series_regressor_estimator_class, + pipeline_class = { + ProblemTypes.BINARY: dummy_binary_pipeline_class, + ProblemTypes.MULTICLASS: dummy_multiclass_pipeline_class, + ProblemTypes.REGRESSION: dummy_regression_pipeline_class, + ProblemTypes.TIME_SERIES_REGRESSION: dummy_time_series_regression_pipeline_class, }[problem_type_value] baseline_pipeline_class = { ProblemTypes.BINARY: "evalml.pipelines.BinaryClassificationPipeline", @@ -1870,15 +1868,35 @@ def test_percent_better_than_baseline_in_rankings( ProblemTypes.TIME_SERIES_REGRESSION: "evalml.pipelines.TimeSeriesRegressionPipeline", }[problem_type_value] - created_pipelines = get_pipelines_from_component_graphs(component_graphs_list=[{"Pipeline1": [estimator_class]}, - {"Pipeline2": [estimator_class]}], - problem_type=problem_type_value) + + class DummyPipeline(pipeline_class): + problem_type = problem_type_value + + def __init__(self, parameters, random_seed=0): + super().__init__(parameters=parameters) + + def new(self, parameters, random_seed=0): + return self.__class__(parameters, random_seed=random_seed) + + def clone(self): + return self.__class__(self.parameters, random_seed=self.random_seed) + + def fit(self, *args, **kwargs): + """Mocking fit""" + + class Pipeline1(DummyPipeline): + custom_name = "Pipeline1" + + class Pipeline2(DummyPipeline): + custom_name = "Pipeline2" + mock_score_1 = MagicMock(return_value={objective.name: pipeline_scores[0]}) mock_score_2 = MagicMock(return_value={objective.name: pipeline_scores[1]}) - created_pipelines[0].score = mock_score_1 - created_pipelines[1].score = mock_score_2 - print('##################################') - print(created_pipelines[0].score) + Pipeline1.score = mock_score_1 + 
Pipeline2.score = mock_score_2 + + pipeline_parameters = {"pipeline": {"date_index": None, "gap": 0, "max_delay": 0}} if problem_type_value == ProblemTypes.TIME_SERIES_REGRESSION else {} + allowed_pipelines = [Pipeline1(pipeline_parameters), Pipeline2(pipeline_parameters)] if objective.name.lower() == "cost benefit matrix": automl = AutoMLSearch( @@ -1897,10 +1915,6 @@ def test_percent_better_than_baseline_in_rankings( y_train=y, problem_type=problem_type_value, max_iterations=3, - allowed_component_graphs=[ - Pipeline1({"pipeline": {"date_index": None, "gap": 0, "max_delay": 0}}), - Pipeline2({"pipeline": {"date_index": None, "gap": 0, "max_delay": 0}}), - ], objective=objective, additional_objectives=[], problem_configuration={"date_index": None, "gap": 0, "max_delay": 0}, @@ -1913,14 +1927,24 @@ def test_percent_better_than_baseline_in_rankings( y_train=y, problem_type=problem_type_value, max_iterations=3, - allowed_component_graphs=[{"Pipeline1": [estimator_class]}, - {"Pipeline2": [estimator_class]}], objective=objective, additional_objectives=[], optimize_thresholds=False, n_jobs=1, ) - #automl.allowed_pipelines = created_pipelines + automl._automl_algorithm = IterativeAlgorithm( + max_iterations=2, + allowed_pipelines=allowed_pipelines, + tuner_class=SKOptTuner, + random_seed=0, + n_jobs=1, + number_features=X.shape[1], + pipelines_per_batch=5, + ensembling=False, + text_in_ensembling=False, + pipeline_params=pipeline_parameters, + custom_hyperparameters=None, + ) with patch( baseline_pipeline_class + ".score", @@ -1963,8 +1987,6 @@ def test_percent_better_than_baseline_in_rankings( 2, ), } - print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%') - print(scores) for name in answers: np.testing.assert_almost_equal(scores[name], answers[name], decimal=3) @@ -2062,11 +2084,23 @@ def fit(self, *args, **kwargs): y_train=y, problem_type=problem_type, max_iterations=2, - allowed_component_graphs=[{"Name_0": ["Imputer", "ARIMA Regressor"]}], objective="auto", problem_configuration={"date_index": None, "gap": 1, "max_delay": 1}, additional_objectives=additional_objectives, ) + automl._automl_algorithm = IterativeAlgorithm( + max_iterations=2, + allowed_pipelines=[DummyPipeline(parameters)], + tuner_class=SKOptTuner, + random_seed=0, + n_jobs=-1, + number_features=X.shape[1], + pipelines_per_batch=5, + ensembling=False, + text_in_ensembling=False, + pipeline_params={"pipeline": {"date_index": None, "gap": 1, "max_delay": 1}}, + custom_hyperparameters=None, + ) with patch(baseline_pipeline_class + ".score", return_value=mock_baseline_scores): automl.search() @@ -2092,7 +2126,7 @@ def fit(self, *args, **kwargs): def test_time_series_regression_with_parameters(ts_data): X, y = ts_data X.index.name = "Date" - problem_configuration = {"date_index": "ssate", "gap": 1, "max_delay": 0} + problem_configuration = {"date_index": "Date", "gap": 1, "max_delay": 0} automl = AutoMLSearch( X_train=X, y_train=y, @@ -2103,8 +2137,6 @@ def test_time_series_regression_with_parameters(ts_data): max_batches=3 ) automl.search() - print("--------------------------") - print(automl.allowed_pipelines[0].parameters) assert automl.allowed_pipelines[0].parameters["pipeline"] == problem_configuration @@ -2152,6 +2184,19 @@ def clone(self): objective="log loss binary", additional_objectives=["f1"], ) + automl._automl_algorithm = IterativeAlgorithm( + max_iterations=2, + allowed_pipelines=[DummyPipeline({})], + tuner_class=SKOptTuner, + random_seed=0, + n_jobs=-1, + number_features=X.shape[1], + pipelines_per_batch=5, + 
ensembling=False, + text_in_ensembling=False, + pipeline_params={}, + custom_hyperparameters=None, + ) automl.search() assert ( diff --git a/evalml/tests/automl_tests/test_automl_dask.py b/evalml/tests/automl_tests/test_automl_dask.py index 74f49c213c..ab9f8029b3 100644 --- a/evalml/tests/automl_tests/test_automl_dask.py +++ b/evalml/tests/automl_tests/test_automl_dask.py @@ -5,14 +5,17 @@ from distributed import Client from evalml.automl import AutoMLSearch +from evalml.automl.automl_algorithm import IterativeAlgorithm from evalml.automl.callbacks import raise_error_callback from evalml.automl.engine import DaskEngine, SequentialEngine +from evalml.automl.utils import get_pipelines_from_component_graphs from evalml.tests.automl_tests.dask_test_utils import ( TestPipelineFast, TestPipelineSlow, TestPipelineWithFitError, TestPipelineWithScoreError, ) +from evalml.tuners import SKOptTuner @pytest.mark.usefixtures("X_y_binary_cls") @@ -97,7 +100,19 @@ def test_automl_train_dask_error_callback(self): problem_type="binary", engine=self.parallel_engine, max_iterations=2, + ) + automl._automl_algorithm = IterativeAlgorithm( + max_iterations=2, allowed_pipelines=pipelines, + tuner_class=SKOptTuner, + random_seed=0, + n_jobs=-1, + number_features=X.shape[1], + pipelines_per_batch=5, + ensembling=False, + text_in_ensembling=False, + pipeline_params={}, + custom_hyperparameters=None, ) automl.train_pipelines(pipelines) assert "Train error for PipelineWithError: Yikes" in self._caplog.text @@ -113,7 +128,19 @@ def test_automl_score_dask_error_callback(self): problem_type="binary", engine=self.parallel_engine, max_iterations=2, + ) + automl._automl_algorithm = IterativeAlgorithm( + max_iterations=2, allowed_pipelines=pipelines, + tuner_class=SKOptTuner, + random_seed=0, + n_jobs=-1, + number_features=X.shape[1], + pipelines_per_batch=5, + ensembling=False, + text_in_ensembling=False, + pipeline_params={}, + custom_hyperparameters=None, ) automl.score_pipelines( pipelines, X, y, objectives=["Log Loss Binary", "F1", "AUC"] @@ -135,10 +162,22 @@ def test_automl_immediate_quit(self): problem_type="binary", engine=self.parallel_engine, max_iterations=4, - allowed_pipelines=pipelines, error_callback=raise_error_callback, optimize_thresholds=False, ) + automl._automl_algorithm = IterativeAlgorithm( + max_iterations=4, + allowed_pipelines=pipelines, + tuner_class=SKOptTuner, + random_seed=0, + n_jobs=-1, + number_features=X.shape[1], + pipelines_per_batch=5, + ensembling=False, + text_in_ensembling=False, + pipeline_params={}, + custom_hyperparameters=None, + ) # Ensure the broken pipeline raises the error with pytest.raises(Exception, match="Yikes"): diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py index e2d707a169..e2fed99ffb 100644 --- a/evalml/tests/automl_tests/test_iterative_algorithm.py +++ b/evalml/tests/automl_tests/test_iterative_algorithm.py @@ -4,6 +4,7 @@ import pytest from skopt.space import Categorical, Integer, Real +from evalml import AutoMLSearch from evalml.automl.automl_algorithm import ( AutoMLAlgorithmException, IterativeAlgorithm, @@ -413,8 +414,31 @@ def test_iterative_algorithm_stacked_ensemble_n_jobs_regression( "parameters", [1, "hello", 1.3, -1.0006, Categorical([1, 3, 4]), Integer(2, 4), Real(2, 6)], ) -def test_iterative_algorithm_pipeline_params(parameters, dummy_binary_pipeline_classes): +def test_iterative_algorithm_pipeline_params(parameters, dummy_binary_pipeline_classes, X_y_binary, 
dummy_classifier_estimator_class): dummy_binary_pipeline_classes = dummy_binary_pipeline_classes(parameters) + + class MockEstimator(Estimator): + name = "Mock Classifier" + model_family = ModelFamily.RANDOM_FOREST + supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS] + if isinstance(hyperparameters, (list, tuple, Real, Categorical, Integer)): + hyperparameter_ranges = {"dummy_parameter": hyperparameters} + else: + hyperparameter_ranges = {"dummy_parameter": [hyperparameters]} + + def __init__( + self, dummy_parameter="default", n_jobs=-1, random_seed=0, **kwargs + ): + super().__init__( + parameters={ + "dummy_parameter": dummy_parameter, + **kwargs, + "n_jobs": n_jobs, + }, + component_obj=None, + random_seed=random_seed, + ) + if isinstance(parameters, (Categorical, Integer, Real)): with pytest.raises( ValueError, @@ -430,6 +454,18 @@ def test_iterative_algorithm_pipeline_params(parameters, dummy_binary_pipeline_c ) return else: + X, y = X_y_binary + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + max_iterations=3, + allowed_component_graphs=[{"Mock Classifier": [dummy_classifier_estimator_class]}], + pipeline_parameters={ + "Mock Classifier": {"dummy_parameter": parameters}, + }, + problem_configuration={"gap": 2, "max_delay": 10}, + ) algo = IterativeAlgorithm( allowed_pipelines=dummy_binary_pipeline_classes, random_seed=0, @@ -545,94 +581,6 @@ def test_iterative_algorithm_custom_hyperparameters( assert all_dummies == {1, 3, 4} if parameters == 1 else all_dummies == {2, 3, 4} -def test_iterative_algorithm_frozen_parameters(): - class MockEstimator(Estimator): - name = "Mock Classifier" - model_family = ModelFamily.RANDOM_FOREST - supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS] - hyperparameter_ranges = { - "dummy_int_parameter": Integer(1, 10), - "dummy_categorical_parameter": Categorical(["random", "dummy", "test"]), - "dummy_real_parameter": Real(0, 1), - } - - def __init__( - self, - dummy_int_parameter=0, - dummy_categorical_parameter="dummy", - dummy_real_parameter=1.0, - n_jobs=-1, - random_seed=0, - **kwargs - ): - super().__init__( - parameters={ - "dummy_int_parameter": dummy_int_parameter, - "dummy_categorical_parameter": dummy_categorical_parameter, - "dummy_real_parameter": dummy_real_parameter, - **kwargs, - "n_jobs": n_jobs, - }, - component_obj=None, - random_seed=random_seed, - ) - - pipeline = BinaryClassificationPipeline([MockEstimator]) - algo = IterativeAlgorithm( - allowed_pipelines=[pipeline, pipeline, pipeline], - pipeline_params={"pipeline": {"date_index": "Date", "gap": 2, "max_delay": 10}}, - random_seed=0, - _frozen_pipeline_parameters={ - "Mock Classifier": { - "dummy_int_parameter": 6, - "dummy_categorical_parameter": "random", - "dummy_real_parameter": 0.1, - } - }, - ) - - next_batch = algo.next_batch() - assert all( - [ - p.parameters["pipeline"] - == {"date_index": "Date", "gap": 2, "max_delay": 10} - for p in next_batch - ] - ) - assert all( - [ - p.parameters["Mock Classifier"] - == { - "dummy_int_parameter": 6, - "dummy_categorical_parameter": "random", - "dummy_real_parameter": 0.1, - "n_jobs": -1, - } - for p in next_batch - ] - ) - - scores = np.arange(0, len(next_batch)) - for score, pipeline in zip(scores, next_batch): - algo.add_result(score, pipeline, {"id": algo.pipeline_number}) - - # make sure that future batches remain in the hyperparam range - for i in range(1, 5): - next_batch = algo.next_batch() - assert all( - [ - p.parameters["Mock Classifier"] - == { - 
"dummy_int_parameter": 6, - "dummy_categorical_parameter": "random", - "dummy_real_parameter": 0.1, - "n_jobs": -1, - } - for p in next_batch - ] - ) - - def test_iterative_algorithm_pipeline_params_kwargs(dummy_binary_pipeline_classes): dummy_binary_pipeline_classes = dummy_binary_pipeline_classes() algo = IterativeAlgorithm( @@ -815,7 +763,6 @@ def test_iterative_algorithm_sampling_params( algo = IterativeAlgorithm( allowed_pipelines=pipelines, random_seed=0, - _frozen_pipeline_parameters={sampler: {"sampling_ratio": 0.5}}, ) next_batch = algo.next_batch() for p in next_batch: From 5921047ca696ac681f872f906c49da7a680219fb Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 10 Jun 2021 03:14:49 -0400 Subject: [PATCH 62/85] lint fixes and add check for unique names --- docs/source/release_notes.rst | 2 + .../automl_algorithm/iterative_algorithm.py | 23 +- evalml/automl/automl_search.py | 20 +- evalml/automl/utils.py | 27 +- evalml/tests/automl_tests/test_automl.py | 259 +++++++++++++----- evalml/tests/automl_tests/test_automl_dask.py | 1 - .../test_automl_search_classification.py | 42 ++- .../test_automl_search_regression.py | 22 +- .../tests/automl_tests/test_automl_utils.py | 56 ++-- .../automl_tests/test_iterative_algorithm.py | 50 +--- evalml/tests/conftest.py | 66 +++-- 11 files changed, 369 insertions(+), 199 deletions(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index f5d86f9c42..1c92b53dd8 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -4,12 +4,14 @@ Release Notes * Enhancements * Fixes * Changes + * Replaced `allowed_pipelines` with `allowed_component_graphs` :pr:`2364` * Documentation Changes * Testing Changes .. warning:: **Breaking Changes** + * `AutoMLSearch` will accept `allowed_component_graphs` instead of `allowed_pipelines` :pr:`2364` **v0.26.0 Jun. 
08, 2021** diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index ec8a741e65..3b5fb09b87 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -172,13 +172,7 @@ def next_batch(self): pipeline = self._first_batch_results[idx][1] for i in range(self.pipelines_per_batch): proposed_parameters = self._tuners[pipeline.name].propose() - print( - f"iterativealgorothm - next_batch - proposed_parameters: {proposed_parameters}" - ) parameters = self._combine_parameters(pipeline, proposed_parameters) - print( - f"iterativealgorothm - next_batch - parameters: {parameters}" - ) next_batch.append( pipeline.new(parameters=parameters, random_seed=self.random_seed) ) @@ -188,12 +182,7 @@ def next_batch(self): def _combine_parameters(self, pipeline, proposed_parameters): """Helper function for logic to transform proposed parameters.""" - print(f"iterative algorithm - _combine_parameters - proposed_parameters: {proposed_parameters}") - print(f"iterative algorithm - _combine_parameters - pipeline: {pipeline.parameters}") - print("----------------------------") - return { - **self._transform_parameters(pipeline, proposed_parameters) - } + return {**self._transform_parameters(pipeline, proposed_parameters)} def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): """Register results from evaluating a pipeline @@ -252,15 +241,9 @@ def _transform_parameters(self, pipeline, proposed_parameters): for name, component_class in pipeline.linearized_component_graph: component_parameters = proposed_parameters.get(name, {}) init_params = inspect.signature(component_class.__init__).parameters - print( - f"iterativealgorothm - _transform_parameters - init_params: {init_params}" - ) # For first batch, pass the pipeline params to the components that need them if name in self._custom_hyperparameters and self._batch_number == 0: for param_name, value in self._custom_hyperparameters[name].items(): - print( - f"iterativealgorothm - _transform_parameters - hyperparameter name/param_name/value: {name}/{param_name}/{value}" - ) if isinstance(value, (Integer, Real)): # get a random value in the space component_parameters[param_name] = value.rvs( @@ -274,9 +257,6 @@ def _transform_parameters(self, pipeline, proposed_parameters): component_parameters[param_name] = value if name in self._pipeline_params and self._batch_number == 0: for param_name, value in self._pipeline_params[name].items(): - print( - f"iterativealgorothm - _transform_parameters - pipeline name/param_name/value: {name}/{param_name}/{value}" - ) if isinstance(value, (Integer, Real, Categorical)): raise ValueError( "Pipeline parameters should not contain skopt.Space variables, please pass them " @@ -300,5 +280,4 @@ def _transform_parameters(self, pipeline, proposed_parameters): if param_name in init_params: component_parameters[param_name] = value parameters[name] = component_parameters - print(f"iterative algorithm - transform parameters - parameters: {parameters}") return parameters diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index b9282a0f64..191530c7d1 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -20,7 +20,8 @@ check_all_pipeline_names_unique, get_best_sampler_for_data, get_default_primary_search_objective, - make_data_splitter, get_pipelines_from_component_graphs, + make_data_splitter, + get_pipelines_from_component_graphs, ) 
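# A hedged, illustrative sketch of the migration described by the release note
# above (``allowed_pipelines`` -> ``allowed_component_graphs``). Assumes evalml
# as of this branch; the dataset, graph name, and parameter values below are
# placeholders chosen for the example, not taken from the patch itself.
from evalml import AutoMLSearch
from evalml.demos import load_breast_cancer

X, y = load_breast_cancer()

# Previously, callers constructed pipeline instances themselves and passed
# ``allowed_pipelines=[BinaryClassificationPipeline(component_graph=[...])]``.
# With this series, callers pass named component graphs instead, and
# AutoMLSearch builds the pipelines internally:
automl = AutoMLSearch(
    X_train=X,
    y_train=y,
    problem_type="binary",
    allowed_component_graphs=[
        {"RF From Graph": ["Imputer", "Random Forest Classifier"]},
    ],
    pipeline_parameters={"Imputer": {"numeric_impute_strategy": "most_frequent"}},
    max_batches=1,
)
automl.search()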
from evalml.data_checks import DefaultDataChecks from evalml.exceptions import ( @@ -391,6 +392,13 @@ def __init__( raise ValueError( "Every component graph passed must be of type dictionary!" ) + unique_names = set() + for graph in allowed_component_graphs: + unique_names.add(list(graph.keys())[0]) + if len(unique_names) < len(allowed_component_graphs): + raise ValueError( + "Every name of allowed_component_graphs must be unique!" + ) self.allowed_component_graphs = allowed_component_graphs self.allowed_model_families = allowed_model_families self._automl_algorithm = None @@ -466,9 +474,7 @@ def __init__( ) index_columns = list(self.X_train.ww.select("index").columns) if len(index_columns) > 0 and drop_columns is None: - parameters["Drop Columns Transformer"] = { - "columns": index_columns - } + parameters["Drop Columns Transformer"] = {"columns": index_columns} self.allowed_pipelines = [ make_pipeline( self.X_train, @@ -481,9 +487,9 @@ def __init__( for estimator in allowed_estimators ] else: - self.allowed_pipelines = get_pipelines_from_component_graphs(self.allowed_component_graphs, - self.problem_type, - parameters) + self.allowed_pipelines = get_pipelines_from_component_graphs( + self.allowed_component_graphs, self.problem_type, parameters + ) if self.allowed_pipelines == []: raise ValueError("No allowed pipelines to search") diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index 094a441bf1..8adf2f5e89 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -5,9 +5,15 @@ from sklearn.model_selection import KFold, StratifiedKFold from evalml.objectives import get_objective -from evalml.pipelines import ComponentGraph, BinaryClassificationPipeline, MulticlassClassificationPipeline, \ - RegressionPipeline, TimeSeriesBinaryClassificationPipeline, TimeSeriesMulticlassClassificationPipeline, \ - TimeSeriesRegressionPipeline +from evalml.pipelines import ( + ComponentGraph, + BinaryClassificationPipeline, + MulticlassClassificationPipeline, + RegressionPipeline, + TimeSeriesBinaryClassificationPipeline, + TimeSeriesMulticlassClassificationPipeline, + TimeSeriesRegressionPipeline, +) from evalml.preprocessing.data_splitters import ( TimeSeriesSplit, TrainingValidationSplit, @@ -223,7 +229,9 @@ def get_hyperparameter_ranges(component_graph, custom_hyperparameters): return hyperparameter_ranges -def get_pipelines_from_component_graphs(component_graphs_list, problem_type, parameters=None): +def get_pipelines_from_component_graphs( + component_graphs_list, problem_type, parameters=None +): """ Returns created pipelines from passed component graphs based on the specified problem type. 
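# A minimal sketch of how ``get_pipelines_from_component_graphs`` (added above)
# is used, mirroring ``test_get_pipelines_from_component_graphs`` later in this
# series; the graph names and the optional per-graph "random_seed" key come
# from that test.
from evalml.automl.utils import get_pipelines_from_component_graphs

component_graphs = [
    # Linear form: an ordered list of component names.
    {"Name_0": ["Imputer", "Random Forest Classifier"], "random_seed": 42},
    # Dict form: each key maps to [component name, *parent node names].
    {
        "Name_1": {
            "Imputer": ["Imputer"],
            "Imputer_1": ["Imputer", "Imputer"],
            "Random Forest Classifier": ["Random Forest Classifier", "Imputer_1"],
        }
    },
]

pipelines = get_pipelines_from_component_graphs(component_graphs, "binary")
assert pipelines[0].random_seed == 42  # "random_seed" is popped from the graph dict
assert pipelines[1].random_seed == 0   # default seed when none is supplied
assert pipelines[0].name == "Name_0"   # the dict key becomes the pipeline name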
@@ -241,7 +249,7 @@ def get_pipelines_from_component_graphs(component_graphs_list, problem_type, par ProblemTypes.REGRESSION: RegressionPipeline, ProblemTypes.TIME_SERIES_BINARY: TimeSeriesBinaryClassificationPipeline, ProblemTypes.TIME_SERIES_MULTICLASS: TimeSeriesMulticlassClassificationPipeline, - ProblemTypes.TIME_SERIES_REGRESSION: TimeSeriesRegressionPipeline + ProblemTypes.TIME_SERIES_REGRESSION: TimeSeriesRegressionPipeline, }[handle_problem_types(problem_type)] created_pipelines = [] for component_graph in component_graphs_list: @@ -250,5 +258,12 @@ def get_pipelines_from_component_graphs(component_graphs_list, problem_type, par comp_seed = component_graph.pop("random_seed") comp_name = next(iter(component_graph)) comp_graph = component_graph[comp_name] - created_pipelines.append(pipeline_class(component_graph=comp_graph, parameters=parameters, custom_name=comp_name, random_seed=comp_seed)) + created_pipelines.append( + pipeline_class( + component_graph=comp_graph, + parameters=parameters, + custom_name=comp_name, + random_seed=comp_seed, + ) + ) return created_pipelines diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index d4606a78bb..cd006e2a5e 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -22,7 +22,8 @@ from evalml.automl.utils import ( _LARGE_DATA_PERCENT_VALIDATION, _LARGE_DATA_ROW_THRESHOLD, - get_default_primary_search_objective, get_pipelines_from_component_graphs, + get_default_primary_search_objective, + get_pipelines_from_component_graphs, ) from evalml.demos import load_breast_cancer, load_wine from evalml.exceptions import ( @@ -513,8 +514,12 @@ def test_automl_feature_selection(mock_fit, mock_score, X_y_binary): max_iterations=2, start_iteration_callback=start_iteration_callback, allowed_component_graphs=[ - {"Name": ["RF Classifier Select From Model", - "Logistic Regression Classifier"]}, + { + "Name": [ + "RF Classifier Select From Model", + "Logistic Regression Classifier", + ] + }, ], ) automl.search() @@ -572,12 +577,17 @@ def test_automl_algorithm(mock_fit, mock_score, mock_algo_next_batch, X_y_binary @patch("evalml.automl.automl_algorithm.IterativeAlgorithm.__init__") def test_automl_allowed_component_graphs_algorithm( - mock_algo_init, dummy_classifier_estimator_class, dummy_binary_pipeline_class, X_y_binary + mock_algo_init, + dummy_classifier_estimator_class, + dummy_binary_pipeline_class, + X_y_binary, ): mock_algo_init.side_effect = Exception("mock algo init") X, y = X_y_binary - allowed_component_graphs = [{"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}] + allowed_component_graphs = [ + {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} + ] with pytest.raises(Exception, match="mock algo init"): AutoMLSearch( X_train=X, @@ -589,7 +599,9 @@ def test_automl_allowed_component_graphs_algorithm( assert mock_algo_init.call_count == 1 _, kwargs = mock_algo_init.call_args assert kwargs["max_iterations"] == 10 - assert kwargs["allowed_pipelines"] == get_pipelines_from_component_graphs(allowed_component_graphs, "binary") + assert kwargs["allowed_pipelines"] == get_pipelines_from_component_graphs( + allowed_component_graphs, "binary" + ) allowed_model_families = [ModelFamily.RANDOM_FOREST] with pytest.raises(Exception, match="mock algo init"): @@ -894,10 +906,33 @@ def test_component_graph_with_incorrect_problem_type( X_train=X, y_train=y, problem_type="regression", - allowed_component_graphs=[{"Mock Binary 
Classification Pipeline": [dummy_classifier_estimator_class]}], + allowed_component_graphs=[ + { + "Mock Binary Classification Pipeline": [ + dummy_classifier_estimator_class + ] + } + ], + ) + + +def test_component_graph_with_nonunique_names(X_y_binary, dummy_classifier_estimator_class): + X, y = X_y_binary + + with pytest.raises(ValueError, match="Every name of allowed_component_graphs must be unique!"): + AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + allowed_component_graphs=[ + {"Name_0": [dummy_classifier_estimator_class]}, + {"Name_1": [dummy_classifier_estimator_class]}, + {"Name_0": [dummy_classifier_estimator_class]} + ], ) + def test_main_objective_problem_type_mismatch(X_y_binary): X, y = X_y_binary with pytest.raises(ValueError, match="is not compatible with a"): @@ -995,7 +1030,13 @@ def test_default_objective(X_y_binary): @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") -def test_add_to_rankings(mock_fit, mock_score, dummy_classifier_linear_component_graph, dummy_binary_pipeline_class, X_y_binary): +def test_add_to_rankings( + mock_fit, + mock_score, + dummy_classifier_linear_component_graph, + dummy_binary_pipeline_class, + X_y_binary, +): X, y = X_y_binary mock_score.return_value = {"Log Loss Binary": 1.0} @@ -1004,7 +1045,7 @@ def test_add_to_rankings(mock_fit, mock_score, dummy_classifier_linear_component y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs=[dummy_classifier_linear_component_graph] + allowed_component_graphs=[dummy_classifier_linear_component_graph], ) automl.search() assert len(automl.rankings) == 1 @@ -1040,7 +1081,11 @@ def test_add_to_rankings(mock_fit, mock_score, dummy_classifier_linear_component @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_add_to_rankings_no_search( - mock_fit, mock_score, dummy_classifier_linear_component_graph, dummy_binary_pipeline_class, X_y_binary + mock_fit, + mock_score, + dummy_classifier_linear_component_graph, + dummy_binary_pipeline_class, + X_y_binary, ): X, y = X_y_binary automl = AutoMLSearch( @@ -1072,7 +1117,9 @@ def test_add_to_rankings_no_search( @patch("evalml.pipelines.RegressionPipeline.score") -def test_add_to_rankings_regression_large(mock_score, dummy_regressor_linear_component_graph, dummy_regression_pipeline_class): +def test_add_to_rankings_regression_large( + mock_score, dummy_regressor_linear_component_graph, dummy_regression_pipeline_class +): X = pd.DataFrame({"col_0": [i for i in range(101000)]}) y = pd.Series([i for i in range(101000)]) @@ -1112,7 +1159,10 @@ def test_add_to_rankings_new_pipeline(dummy_regression_pipeline_class): @patch("evalml.pipelines.RegressionPipeline.score") def test_add_to_rankings_regression( - mock_score, dummy_regressor_linear_component_graph, dummy_regression_pipeline_class, X_y_regression + mock_score, + dummy_regressor_linear_component_graph, + dummy_regression_pipeline_class, + X_y_regression, ): X, y = X_y_regression @@ -1136,7 +1186,11 @@ def test_add_to_rankings_regression( @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_add_to_rankings_duplicate( - mock_fit, mock_score, dummy_classifier_linear_component_graph, dummy_binary_pipeline_class, X_y_binary + mock_fit, + mock_score, + dummy_classifier_linear_component_graph, + dummy_binary_pipeline_class, + X_y_binary, ): X, y = X_y_binary 
mock_score.return_value = {"Log Loss Binary": 0.1234} @@ -1161,7 +1215,11 @@ def test_add_to_rankings_duplicate( @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_add_to_rankings_trained( - mock_fit, mock_score, dummy_classifier_estimator_class, dummy_binary_pipeline_class, X_y_binary + mock_fit, + mock_score, + dummy_classifier_estimator_class, + dummy_binary_pipeline_class, + X_y_binary, ): X, y = X_y_binary mock_score.return_value = {"Log Loss Binary": 1.0} @@ -1176,7 +1234,8 @@ class CoolBinaryClassificationPipeline(dummy_binary_pipeline_class): max_iterations=1, allowed_component_graphs=[ {"Cool Binary Classification Pipeline": [dummy_classifier_estimator_class]}, - {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}], + {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}, + ], ) automl.search() assert len(automl.rankings) == 1 @@ -1868,7 +1927,6 @@ def test_percent_better_than_baseline_in_rankings( ProblemTypes.TIME_SERIES_REGRESSION: "evalml.pipelines.TimeSeriesRegressionPipeline", }[problem_type_value] - class DummyPipeline(pipeline_class): problem_type = problem_type_value @@ -1895,7 +1953,11 @@ class Pipeline2(DummyPipeline): Pipeline1.score = mock_score_1 Pipeline2.score = mock_score_2 - pipeline_parameters = {"pipeline": {"date_index": None, "gap": 0, "max_delay": 0}} if problem_type_value == ProblemTypes.TIME_SERIES_REGRESSION else {} + pipeline_parameters = ( + {"pipeline": {"date_index": None, "gap": 0, "max_delay": 0}} + if problem_type_value == ProblemTypes.TIME_SERIES_REGRESSION + else {} + ) allowed_pipelines = [Pipeline1(pipeline_parameters), Pipeline2(pipeline_parameters)] if objective.name.lower() == "cost benefit matrix": @@ -2134,7 +2196,7 @@ def test_time_series_regression_with_parameters(ts_data): allowed_component_graphs=[{"Name_0": ["Imputer", "ARIMA Regressor"]}], objective="auto", problem_configuration=problem_configuration, - max_batches=3 + max_batches=3, ) automl.search() assert automl.allowed_pipelines[0].parameters["pipeline"] == problem_configuration @@ -2443,10 +2505,14 @@ def test_automl_one_allowed_component_graph_ensembling_disabled( caplog.clear() max_iterations = _get_first_stacked_classifier_no([ModelFamily.LINEAR_MODEL]) + 1 - allowed_component_graph = {"Logistic Regression Binary Pipeline": ["Imputer", - "One Hot Encoder", - "Standard Scaler", - "Logistic Regression Classifier",]} + allowed_component_graph = { + "Logistic Regression Binary Pipeline": [ + "Imputer", + "One Hot Encoder", + "Standard Scaler", + "Logistic Regression Classifier", + ] + } automl = AutoMLSearch( X_train=X, y_train=y, @@ -2710,7 +2776,7 @@ def test_iterative_algorithm_pipeline_hyperparameters_make_pipeline_other_errors estimators = get_estimators("multiclass", [ModelFamily.EXTRA_TREES]) component_graphs = [ - {f'CG_{ind}': [estimator]} for ind, estimator in enumerate(estimators) + {f"CG_{ind}": [estimator]} for ind, estimator in enumerate(estimators) ] automl = AutoMLSearch( X_train=X, @@ -2728,9 +2794,7 @@ def test_iterative_algorithm_pipeline_hyperparameters_make_pipeline_other_errors assert "Default parameters for components" not in str(error.value) -@pytest.mark.parametrize( - "component_graphs", [True, False] -) +@pytest.mark.parametrize("component_graphs", [True, False]) @pytest.mark.parametrize("automl_parameters", [True, False]) @pytest.mark.parametrize("custom_hyperparameters", [True, False]) @patch( @@ -2755,9 +2819,14 @@ def 
test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline( if component_graphs: component_graph_ = [ - {"Name_0": ["Drop Columns Transformer", - "Imputer", - "Random Forest Classifier",]}] + { + "Name_0": [ + "Drop Columns Transformer", + "Imputer", + "Random Forest Classifier", + ] + } + ] if automl_parameters: automl_parameters_ = { @@ -2843,7 +2912,11 @@ def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline( ) @patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_iterative_algorithm_passes_njobs_to_pipelines( - mock_fit, mock_score, dummy_classifier_estimator_class, dummy_binary_pipeline_class, X_y_binary + mock_fit, + mock_score, + dummy_classifier_estimator_class, + dummy_binary_pipeline_class, + X_y_binary, ): X, y = X_y_binary @@ -2869,7 +2942,7 @@ def __init__(self, n_jobs=-1, random_seed=0): allowed_component_graphs=[ {"Pipeline 1": [MockEstimatorWithNJobs]}, {"Pipeline 2": [MockEstimatorWithNJobs]}, - {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} + {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}, ], ) automl.search() @@ -3064,8 +3137,9 @@ def test_automl_respects_random_seed( X_train=X, y_train=y, problem_type="binary", - allowed_component_graphs=[{"Name_0": [dummy_classifier_estimator_class], - "random_seed": 0}], + allowed_component_graphs=[ + {"Name_0": [dummy_classifier_estimator_class], "random_seed": 0} + ], random_seed=42, max_iterations=10, ) @@ -3551,17 +3625,16 @@ def test_automl_respects_pipeline_parameters_with_duplicate_components( "Imputer_1": ["Imputer", "Imputer"], "Random Forest Classifier": ["Random Forest Classifier", "Imputer_1"], } - pipeline_dict = BinaryClassificationPipeline( - component_graph_dict, custom_name="Pipeline from dict" - ) - component_graph_linear = ["Imputer", "Imputer", "Random Forest Classifier"] + automl = AutoMLSearch( X, y, problem_type="binary", - allowed_component_graphs=[{"Pipeline from dict": component_graph_dict}, - {"Pipeline from linear": ["Imputer", "Imputer", "Random Forest Classifier"]}], + allowed_component_graphs=[ + {"Pipeline from dict": component_graph_dict}, + {"Pipeline from linear": component_graph_linear}, + ], pipeline_parameters={ "Imputer": {"numeric_impute_strategy": "most_frequent"}, "Imputer_1": {"numeric_impute_strategy": "median"}, @@ -3588,8 +3661,10 @@ def test_automl_respects_pipeline_parameters_with_duplicate_components( X, y, problem_type="binary", - allowed_component_graphs=[{"Pipeline from dict": component_graph_dict}, - {"Pipeline from linear": component_graph_linear}], + allowed_component_graphs=[ + {"Pipeline from dict": component_graph_dict}, + {"Pipeline from linear": component_graph_linear}, + ], pipeline_parameters={ "One Hot Encoder": {"top_n": 15}, "One Hot Encoder_1": {"top_n": 25}, @@ -3621,7 +3696,9 @@ def test_automl_respects_pipeline_custom_hyperparameters_with_duplicate_componen }, "Random Forest Classifier": {"n_estimators": Categorical([100, 125])}, } - component_graph = {"Name_linear": ["Imputer", "Imputer", "Random Forest Classifier"]} + component_graph = { + "Name_linear": ["Imputer", "Imputer", "Random Forest Classifier"] + } else: custom_hyperparameters = { "Imputer": { @@ -3630,11 +3707,13 @@ def test_automl_respects_pipeline_custom_hyperparameters_with_duplicate_componen "Imputer_1": {"numeric_impute_strategy": Categorical(["median", "mean"])}, "Random Forest Classifier": {"n_estimators": Categorical([50, 100])}, } - component_graph = {"Name_dict": { - "Imputer": ["Imputer"], 
- "Imputer_1": ["Imputer", "Imputer"], - "Random Forest Classifier": ["Random Forest Classifier", "Imputer_1"], - }} + component_graph = { + "Name_dict": { + "Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + "Random Forest Classifier": ["Random Forest Classifier", "Imputer_1"], + } + } automl = AutoMLSearch( X, @@ -3645,6 +3724,7 @@ def test_automl_respects_pipeline_custom_hyperparameters_with_duplicate_componen max_batches=5, ) from pprint import pp + automl.search() for i, row in automl.full_rankings.iterrows(): if "Mode Baseline Binary" in row["pipeline_name"]: @@ -3686,20 +3766,40 @@ def test_automl_adds_pipeline_parameters_to_custom_pipeline_hyperparams( ): X, y = X_y_binary - component_graph_0 = {"Pipe Line One": { - "Imputer": ["Imputer"], - "Imputer_1": ["Imputer", "Imputer"], - "One Hot Encoder": ["One Hot Encoder", "Imputer_1"], - "Random Forest Classifier": ["Random Forest Classifier", "One Hot Encoder"], - }} - component_graph_1 = {"Pipe Line Two": ["Imputer", "Imputer", "One Hot Encoder", "Random Forest Classifier"]} - component_graph_2 = {"Pipe Line Three": ["Imputer", "Imputer", "One Hot Encoder", "Random Forest Classifier"]} + component_graph_0 = { + "Pipe Line One": { + "Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + "One Hot Encoder": ["One Hot Encoder", "Imputer_1"], + "Random Forest Classifier": ["Random Forest Classifier", "One Hot Encoder"], + } + } + component_graph_1 = { + "Pipe Line Two": [ + "Imputer", + "Imputer", + "One Hot Encoder", + "Random Forest Classifier", + ] + } + component_graph_2 = { + "Pipe Line Three": [ + "Imputer", + "Imputer", + "One Hot Encoder", + "Random Forest Classifier", + ] + } automl = AutoMLSearch( X, y, problem_type="binary", - allowed_component_graphs=[component_graph_0, component_graph_1, component_graph_2], + allowed_component_graphs=[ + component_graph_0, + component_graph_1, + component_graph_2, + ], pipeline_parameters={"Imputer": {"numeric_impute_strategy": "most_frequent"}}, custom_hyperparameters={ "One Hot Encoder": {"top_n": Categorical([12, 10])}, @@ -3831,7 +3931,9 @@ def test_automl_raises_error_with_duplicate_pipeline_names(X_y_binary): X, y = X_y_binary component_graph_0 = {"Custom Pipeline": ["Imputer", "Random Forest Classifier"]} - component_graph_1 = {"Custom Pipeline": ["Imputer", "Logistic Regression Classifier"]} + component_graph_1 = { + "Custom Pipeline": ["Imputer", "Logistic Regression Classifier"] + } component_graph_2 = {"My Pipeline 3": ["Logistic Regression Classifier"]} component_graph_3 = {"My Pipeline 3": ["Random Forest Classifier"]} @@ -3843,7 +3945,11 @@ def test_automl_raises_error_with_duplicate_pipeline_names(X_y_binary): X, y, problem_type="binary", - allowed_component_graphs=[component_graph_0, component_graph_1, component_graph_2], + allowed_component_graphs=[ + component_graph_0, + component_graph_1, + component_graph_2, + ], ) with pytest.raises( @@ -3854,7 +3960,12 @@ def test_automl_raises_error_with_duplicate_pipeline_names(X_y_binary): X, y, problem_type="binary", - allowed_component_graphs=[component_graph_0, component_graph_1, component_graph_2, component_graph_3], + allowed_component_graphs=[ + component_graph_0, + component_graph_1, + component_graph_2, + component_graph_3, + ], ) @@ -4095,7 +4206,9 @@ def test_score_batch_works( y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs=[{"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}], + allowed_component_graphs=[ + {"Mock Binary Classification 
Pipeline": [dummy_classifier_estimator_class]} + ], ) def make_pipeline_name(index): @@ -4160,7 +4273,9 @@ class Pipeline2(dummy_binary_pipeline_class): y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs=[{"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}], + allowed_component_graphs=[ + {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} + ], ) with pytest.raises( @@ -4186,7 +4301,9 @@ def test_score_batch_before_fitting_yields_error_nan_scores( y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs=[{"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}], + allowed_component_graphs=[ + {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} + ], ) scored_pipelines = automl.score_pipelines( @@ -4334,10 +4451,18 @@ def test_automl_validates_data_passed_in_to_allowed_component_graphs( X, y = X_y_binary with pytest.raises( - ValueError, match="Parameter allowed_component_graphs must be either None or a list!" + ValueError, + match="Parameter allowed_component_graphs must be either None or a list!", ): AutoMLSearch( - X, y, problem_type="binary", allowed_component_graphs={"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} + X, + y, + problem_type="binary", + allowed_component_graphs={ + "Mock Binary Classification Pipeline": [ + dummy_classifier_estimator_class + ] + }, ) with pytest.raises( @@ -4350,7 +4475,11 @@ def test_automl_validates_data_passed_in_to_allowed_component_graphs( problem_type="binary", allowed_component_graphs=[ "Mock Binary Classification Pipeline", - {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}, + { + "Mock Binary Classification Pipeline": [ + dummy_classifier_estimator_class + ] + }, ], ) diff --git a/evalml/tests/automl_tests/test_automl_dask.py b/evalml/tests/automl_tests/test_automl_dask.py index ab9f8029b3..9ce0473fd9 100644 --- a/evalml/tests/automl_tests/test_automl_dask.py +++ b/evalml/tests/automl_tests/test_automl_dask.py @@ -8,7 +8,6 @@ from evalml.automl.automl_algorithm import IterativeAlgorithm from evalml.automl.callbacks import raise_error_callback from evalml.automl.engine import DaskEngine, SequentialEngine -from evalml.automl.utils import get_pipelines_from_component_graphs from evalml.tests.automl_tests.dask_test_utils import ( TestPipelineFast, TestPipelineSlow, diff --git a/evalml/tests/automl_tests/test_automl_search_classification.py b/evalml/tests/automl_tests/test_automl_search_classification.py index 72bc08c837..2e68bdcf98 100644 --- a/evalml/tests/automl_tests/test_automl_search_classification.py +++ b/evalml/tests/automl_tests/test_automl_search_classification.py @@ -708,14 +708,20 @@ def test_automl_allowed_component_graphs_no_component_graphs( @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_automl_component_graphs_specified_component_graphs_binary( - mock_fit, mock_score, dummy_classifier_estimator_class, dummy_binary_pipeline_class, X_y_binary + mock_fit, + mock_score, + dummy_classifier_estimator_class, + dummy_binary_pipeline_class, + X_y_binary, ): X, y = X_y_binary automl = AutoMLSearch( X_train=X, y_train=y, problem_type="binary", - allowed_component_graphs=[{"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class] }], + allowed_component_graphs=[ + {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} + ], 
allowed_model_families=None, ) expected_pipeline = dummy_binary_pipeline_class({}) @@ -740,14 +746,24 @@ def test_automl_component_graphs_specified_component_graphs_binary( @patch("evalml.pipelines.MulticlassClassificationPipeline.score") @patch("evalml.pipelines.MulticlassClassificationPipeline.fit") def test_automl_component_graphs_specified_component_graphs_multi( - mock_fit, mock_score, dummy_classifier_estimator_class, dummy_multiclass_pipeline_class, X_y_multi + mock_fit, + mock_score, + dummy_classifier_estimator_class, + dummy_multiclass_pipeline_class, + X_y_multi, ): X, y = X_y_multi automl = AutoMLSearch( X_train=X, y_train=y, problem_type="multiclass", - allowed_component_graphs=[{"Mock Multiclass Classification Pipeline": [dummy_classifier_estimator_class] }], + allowed_component_graphs=[ + { + "Mock Multiclass Classification Pipeline": [ + dummy_classifier_estimator_class + ] + } + ], allowed_model_families=None, ) expected_pipeline = dummy_multiclass_pipeline_class({}) @@ -944,7 +960,9 @@ def test_automl_component_graphs_init_allowed_both_specified_binary( X_train=X, y_train=y, problem_type="binary", - allowed_component_graphs=[{"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class] }], + allowed_component_graphs=[ + {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} + ], allowed_model_families=[ModelFamily.RANDOM_FOREST], ) mock_score.return_value = {automl.objective.name: 1.0} @@ -981,7 +999,13 @@ def test_automl_component_graphs_init_allowed_both_specified_multi( X_train=X, y_train=y, problem_type="multiclass", - allowed_component_graphs=[{"Mock Multiclass Classification Pipeline": [dummy_classifier_estimator_class] }], + allowed_component_graphs=[ + { + "Mock Multiclass Classification Pipeline": [ + dummy_classifier_estimator_class + ] + } + ], allowed_model_families=[ModelFamily.RANDOM_FOREST], ) mock_score.return_value = {automl.objective.name: 1.0} @@ -1059,11 +1083,13 @@ def test_automl_component_graphs_search( for i in range(1, 5): if problem_type == "binary": assert isinstance( - start_iteration_callback.call_args_list[i][0][0], BinaryClassificationPipeline + start_iteration_callback.call_args_list[i][0][0], + BinaryClassificationPipeline, ) elif problem_type == "multiclass": assert isinstance( - start_iteration_callback.call_args_list[i][0][0], MulticlassClassificationPipeline + start_iteration_callback.call_args_list[i][0][0], + MulticlassClassificationPipeline, ) diff --git a/evalml/tests/automl_tests/test_automl_search_regression.py b/evalml/tests/automl_tests/test_automl_search_regression.py index e2c25570d5..3688466eb0 100644 --- a/evalml/tests/automl_tests/test_automl_search_regression.py +++ b/evalml/tests/automl_tests/test_automl_search_regression.py @@ -230,7 +230,11 @@ def test_automl_component_graphs_no_allowed_component_graphs(X_y_regression): @patch("evalml.pipelines.RegressionPipeline.score") @patch("evalml.pipelines.RegressionPipeline.fit") def test_automl_allowed_component_graphs_specified_component_graphs( - mock_fit, mock_score, dummy_regressor_estimator_class, dummy_regression_pipeline_class, X_y_regression + mock_fit, + mock_score, + dummy_regressor_estimator_class, + dummy_regression_pipeline_class, + X_y_regression, ): X, y = X_y_regression @@ -238,7 +242,9 @@ def test_automl_allowed_component_graphs_specified_component_graphs( X_train=X, y_train=y, problem_type="regression", - allowed_component_graphs=[{"Mock Regression Pipeline": [dummy_regressor_estimator_class] }], + allowed_component_graphs=[ + 
{"Mock Regression Pipeline": [dummy_regressor_estimator_class]} + ], allowed_model_families=None, ) mock_score.return_value = {automl.objective.name: 1.0} @@ -351,7 +357,9 @@ def test_automl_allowed_component_graphs_init_allowed_both_specified( X_train=X, y_train=y, problem_type="regression", - allowed_component_graphs=[{"Mock Regression Pipeline": [dummy_regressor_estimator_class]}], + allowed_component_graphs=[ + {"Mock Regression Pipeline": [dummy_regressor_estimator_class]} + ], allowed_model_families=[ModelFamily.RANDOM_FOREST], ) mock_score.return_value = {automl.objective.name: 1.0} @@ -396,8 +404,12 @@ def test_automl_allowed_component_graphs_search( automl.search() assert start_iteration_callback.call_count == 2 - assert isinstance(start_iteration_callback.call_args_list[0][0][0], RegressionPipeline) - assert isinstance(start_iteration_callback.call_args_list[1][0][0], RegressionPipeline) + assert isinstance( + start_iteration_callback.call_args_list[0][0][0], RegressionPipeline + ) + assert isinstance( + start_iteration_callback.call_args_list[1][0][0], RegressionPipeline + ) @patch("evalml.pipelines.TimeSeriesRegressionPipeline.score", return_value={"R2": 0.3}) diff --git a/evalml/tests/automl_tests/test_automl_utils.py b/evalml/tests/automl_tests/test_automl_utils.py index 65654fcc52..1019684eb1 100644 --- a/evalml/tests/automl_tests/test_automl_utils.py +++ b/evalml/tests/automl_tests/test_automl_utils.py @@ -5,7 +5,6 @@ from sklearn.model_selection import KFold, StratifiedKFold from skopt.space import Categorical, Integer -from evalml.automl.automl_algorithm import IterativeAlgorithm from evalml.automl.utils import ( _LARGE_DATA_PERCENT_VALIDATION, _LARGE_DATA_ROW_THRESHOLD, @@ -17,8 +16,11 @@ tune_binary_threshold, ) from evalml.objectives import F1, R2, LogLossBinary, LogLossMulticlass -from evalml.pipelines import BinaryClassificationPipeline, MulticlassClassificationPipeline, RegressionPipeline, \ - TimeSeriesRegressionPipeline +from evalml.pipelines import ( + BinaryClassificationPipeline, + MulticlassClassificationPipeline, + RegressionPipeline, +) from evalml.preprocessing.data_splitters import ( TimeSeriesSplit, TrainingValidationSplit, @@ -321,28 +323,46 @@ def test_get_hyperparameter_ranges(): assert algo_ranges == hyper_ranges -@pytest.mark.parametrize("problem_type,estimator", [("binary", "Random Forest Classifier"), - ("multiclass", "Random Forest Classifier"), - ("regression", "Random Forest Regressor"), - ("time series regression", "ARIMA Regressor")]) +@pytest.mark.parametrize( + "problem_type,estimator", + [ + ("binary", "Random Forest Classifier"), + ("multiclass", "Random Forest Classifier"), + ("regression", "Random Forest Regressor"), + ("time series regression", "ARIMA Regressor"), + ], +) def test_get_pipelines_from_component_graphs(problem_type, estimator): - component_graphs = [{"Name_0": ["Imputer", estimator], - "random_seed": 42}, - {"Name_1": { - "Imputer": ["Imputer"], - "Imputer_1": ["Imputer", "Imputer"], - estimator: [estimator, "Imputer_1"], - }}] + component_graphs = [ + {"Name_0": ["Imputer", estimator], "random_seed": 42}, + { + "Name_1": { + "Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + estimator: [estimator, "Imputer_1"], + } + }, + ] if problem_type == "time series regression": with pytest.raises(ValueError, match="date_index, gap, and max_delay"): get_pipelines_from_component_graphs(component_graphs, problem_type) else: - returned_pipelines = get_pipelines_from_component_graphs(component_graphs, problem_type) + 
returned_pipelines = get_pipelines_from_component_graphs( + component_graphs, problem_type + ) assert returned_pipelines[0].random_seed == 42 assert returned_pipelines[1].random_seed == 0 if problem_type == "binary": - assert all(isinstance(pipe_, BinaryClassificationPipeline) for pipe_ in returned_pipelines) + assert all( + isinstance(pipe_, BinaryClassificationPipeline) + for pipe_ in returned_pipelines + ) elif problem_type == "multiclass": - assert all(isinstance(pipe_, MulticlassClassificationPipeline) for pipe_ in returned_pipelines) + assert all( + isinstance(pipe_, MulticlassClassificationPipeline) + for pipe_ in returned_pipelines + ) elif problem_type == "regression": - assert all(isinstance(pipe_, RegressionPipeline) for pipe_ in returned_pipelines) + assert all( + isinstance(pipe_, RegressionPipeline) for pipe_ in returned_pipelines + ) diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py index e2fed99ffb..fe187552ec 100644 --- a/evalml/tests/automl_tests/test_iterative_algorithm.py +++ b/evalml/tests/automl_tests/test_iterative_algorithm.py @@ -414,31 +414,12 @@ def test_iterative_algorithm_stacked_ensemble_n_jobs_regression( "parameters", [1, "hello", 1.3, -1.0006, Categorical([1, 3, 4]), Integer(2, 4), Real(2, 6)], ) -def test_iterative_algorithm_pipeline_params(parameters, dummy_binary_pipeline_classes, X_y_binary, dummy_classifier_estimator_class): +def test_iterative_algorithm_pipeline_params( + parameters, + dummy_binary_pipeline_classes, +): dummy_binary_pipeline_classes = dummy_binary_pipeline_classes(parameters) - class MockEstimator(Estimator): - name = "Mock Classifier" - model_family = ModelFamily.RANDOM_FOREST - supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS] - if isinstance(hyperparameters, (list, tuple, Real, Categorical, Integer)): - hyperparameter_ranges = {"dummy_parameter": hyperparameters} - else: - hyperparameter_ranges = {"dummy_parameter": [hyperparameters]} - - def __init__( - self, dummy_parameter="default", n_jobs=-1, random_seed=0, **kwargs - ): - super().__init__( - parameters={ - "dummy_parameter": dummy_parameter, - **kwargs, - "n_jobs": n_jobs, - }, - component_obj=None, - random_seed=random_seed, - ) - if isinstance(parameters, (Categorical, Integer, Real)): with pytest.raises( ValueError, @@ -454,18 +435,6 @@ def __init__( ) return else: - X, y = X_y_binary - automl = AutoMLSearch( - X_train=X, - y_train=y, - problem_type="binary", - max_iterations=3, - allowed_component_graphs=[{"Mock Classifier": [dummy_classifier_estimator_class]}], - pipeline_parameters={ - "Mock Classifier": {"dummy_parameter": parameters}, - }, - problem_configuration={"gap": 2, "max_delay": 10}, - ) algo = IterativeAlgorithm( allowed_pipelines=dummy_binary_pipeline_classes, random_seed=0, @@ -480,13 +449,10 @@ def __init__( assert all( [p.parameters["pipeline"] == {"gap": 2, "max_delay": 10} for p in next_batch] ) - - next_batch = algo.next_batch() - - assert all([p.parameters["Mock Classifier"]["n_jobs"] == -1 for p in next_batch]) assert all( [ - p.parameters["Mock Classifier"]["dummy_parameter"] == parameters + p.parameters["Mock Classifier"] + == {"dummy_parameter": parameter, "n_jobs": -1} for p in next_batch ] ) @@ -768,7 +734,7 @@ def test_iterative_algorithm_sampling_params( for p in next_batch: for component in p.component_graph: if "sampler" in component.name: - assert component.parameters["sampling_ratio"] == 0.5 + assert component.parameters["sampling_ratio"] == 
0.25 scores = np.arange(0, len(next_batch)) for score, pipeline in zip(scores, next_batch): @@ -780,4 +746,4 @@ def test_iterative_algorithm_sampling_params( for p in next_batch: for component in p.component_graph: if "sampler" in component.name: - assert component.parameters["sampling_ratio"] == 0.5 + assert component.parameters["sampling_ratio"] == 0.25 diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 17f68ea7d1..c760b8247e 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -20,7 +20,7 @@ RegressionPipeline, TimeSeriesBinaryClassificationPipeline, TimeSeriesMulticlassClassificationPipeline, - TimeSeriesRegressionPipeline, ComponentGraph, RandomForestClassifier, + TimeSeriesRegressionPipeline, ) from evalml.pipelines.components import ( DecisionTreeClassifier, @@ -393,65 +393,81 @@ def fit(self, X, y): @pytest.fixture def dummy_classifier_linear_component_graph(dummy_classifier_estimator_class): - component_graph_linear = {"Name": ["Imputer", "One Hot Encoder", dummy_classifier_estimator_class]} + component_graph_linear = { + "Name": ["Imputer", "One Hot Encoder", dummy_classifier_estimator_class] + } return component_graph_linear @pytest.fixture def dummy_binary_linear_component_graph(dummy_binary_estimator_class): - component_graph_linear = {"Name": ["Imputer", "One Hot Encoder", dummy_binary_estimator_class]} + component_graph_linear = { + "Name": ["Imputer", "One Hot Encoder", dummy_binary_estimator_class] + } return component_graph_linear @pytest.fixture def dummy_multiclass_linear_component_graph(dummy_multiclass_estimator_class): - component_graph_linear = {"Name": ["Imputer", "One Hot Encoder", dummy_multiclass_estimator_class]} + component_graph_linear = { + "Name": ["Imputer", "One Hot Encoder", dummy_multiclass_estimator_class] + } return component_graph_linear @pytest.fixture def dummy_regressor_linear_component_graph(dummy_regressor_estimator_class): - component_graph_linear = {"Name": ["Imputer", "One Hot Encoder", dummy_regressor_estimator_class]} + component_graph_linear = { + "Name": ["Imputer", "One Hot Encoder", dummy_regressor_estimator_class] + } return component_graph_linear @pytest.fixture def dummy_classifier_dict_component_graph(dummy_classifier_estimator_class): - component_graph_dict = {"Name": { - "Imputer": ["Imputer"], - "Imputer_1": ["Imputer", "Imputer"], - "Random Forest Classifier": [dummy_classifier_estimator_class, "Imputer_1"], - }} + component_graph_dict = { + "Name": { + "Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + "Random Forest Classifier": [dummy_classifier_estimator_class, "Imputer_1"], + } + } return component_graph_dict @pytest.fixture def dummy_binary_dict_component_graph(dummy_binary_estimator_class): - component_graph_dict = {"Name": { - "Imputer": ["Imputer"], - "Imputer_1": ["Imputer", "Imputer"], - "Random Forest Classifier": [dummy_binary_estimator_class, "Imputer_1"], - }} + component_graph_dict = { + "Name": { + "Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + "Random Forest Classifier": [dummy_binary_estimator_class, "Imputer_1"], + } + } return component_graph_dict @pytest.fixture def dummy_multiclass_dict_component_graph(dummy_multiclass_estimator_class): - component_graph_dict = {"Name": { - "Imputer": ["Imputer"], - "Imputer_1": ["Imputer", "Imputer"], - "Random Forest Classifier": [dummy_multiclass_estimator_class, "Imputer_1"], - }} + component_graph_dict = { + "Name": { + "Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + "Random Forest 
Classifier": [dummy_multiclass_estimator_class, "Imputer_1"], + } + } return component_graph_dict @pytest.fixture def dummy_regressor_dict_component_graph(dummy_regressor_estimator_class): - component_graph_dict = {"Name": { - "Imputer": ["Imputer"], - "Imputer_1": ["Imputer", "Imputer"], - "Random Forest Classifier": [dummy_regressor_estimator_class, "Imputer_1"], - }} + component_graph_dict = { + "Name": { + "Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + "Random Forest Classifier": [dummy_regressor_estimator_class, "Imputer_1"], + } + } return component_graph_dict From fc9bc2ae11387d3b9265f26c8be5f1b14393bc15 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 10 Jun 2021 03:17:00 -0400 Subject: [PATCH 63/85] lint fixes --- evalml/automl/automl_search.py | 1 - evalml/tests/automl_tests/test_automl.py | 11 +++++++---- evalml/tests/automl_tests/test_iterative_algorithm.py | 1 - 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 191530c7d1..a8ff005451 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -38,7 +38,6 @@ from evalml.pipelines import ( BinaryClassificationPipeline, MulticlassClassificationPipeline, - PipelineBase, RegressionPipeline, TimeSeriesBinaryClassificationPipeline, TimeSeriesMulticlassClassificationPipeline, diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index cd006e2a5e..c8fcbd9ec2 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -916,10 +916,14 @@ def test_component_graph_with_incorrect_problem_type( ) -def test_component_graph_with_nonunique_names(X_y_binary, dummy_classifier_estimator_class): +def test_component_graph_with_nonunique_names( + X_y_binary, dummy_classifier_estimator_class +): X, y = X_y_binary - with pytest.raises(ValueError, match="Every name of allowed_component_graphs must be unique!"): + with pytest.raises( + ValueError, match="Every name of allowed_component_graphs must be unique!" 
+ ): AutoMLSearch( X_train=X, y_train=y, @@ -927,12 +931,11 @@ def test_component_graph_with_nonunique_names(X_y_binary, dummy_classifier_estim allowed_component_graphs=[ {"Name_0": [dummy_classifier_estimator_class]}, {"Name_1": [dummy_classifier_estimator_class]}, - {"Name_0": [dummy_classifier_estimator_class]} + {"Name_0": [dummy_classifier_estimator_class]}, ], ) - def test_main_objective_problem_type_mismatch(X_y_binary): X, y = X_y_binary with pytest.raises(ValueError, match="is not compatible with a"): diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py index fe187552ec..f7aaa10a79 100644 --- a/evalml/tests/automl_tests/test_iterative_algorithm.py +++ b/evalml/tests/automl_tests/test_iterative_algorithm.py @@ -4,7 +4,6 @@ import pytest from skopt.space import Categorical, Integer, Real -from evalml import AutoMLSearch from evalml.automl.automl_algorithm import ( AutoMLAlgorithmException, IterativeAlgorithm, From b58f6cdf0337f692da4dd8364413fc28fd54fbc6 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 10 Jun 2021 03:25:12 -0400 Subject: [PATCH 64/85] lint fixes and update engine tests --- dev-requirements.txt | 1 + evalml/tests/automl_tests/test_engine_base.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index e1d4243a59..e2b3876b0d 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -4,3 +4,4 @@ flake8==3.7.0 black==21.5b1 isort==5.0.0 + diff --git a/evalml/tests/automl_tests/test_engine_base.py b/evalml/tests/automl_tests/test_engine_base.py index 200c9137cf..53d3a7b6f9 100644 --- a/evalml/tests/automl_tests/test_engine_base.py +++ b/evalml/tests/automl_tests/test_engine_base.py @@ -14,7 +14,7 @@ @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_train_and_score_pipelines( - mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary + mock_fit, mock_score, dummy_classifier_estimator_class, dummy_binary_pipeline_class, X_y_binary ): X, y = X_y_binary mock_score.return_value = {"Log Loss Binary": 0.42} @@ -24,7 +24,7 @@ def test_train_and_score_pipelines( problem_type="binary", max_time=1, max_batches=1, - allowed_pipelines=[dummy_binary_pipeline_class({})], + allowed_component_graphs=[{"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}], ) pipeline = dummy_binary_pipeline_class({}) evaluation_result = evaluate_pipeline( @@ -51,7 +51,7 @@ def test_train_and_score_pipelines( @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_train_and_score_pipelines_error( - mock_fit, mock_score, dummy_binary_pipeline_class, X_y_binary, caplog + mock_fit, mock_score, dummy_classifier_estimator_class, dummy_binary_pipeline_class, X_y_binary, caplog ): X, y = X_y_binary mock_score.side_effect = Exception("yeet") @@ -61,7 +61,7 @@ def test_train_and_score_pipelines_error( problem_type="binary", max_time=1, max_batches=1, - allowed_pipelines=[dummy_binary_pipeline_class({})], + allowed_component_graphs=[{"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}], ) pipeline = dummy_binary_pipeline_class({}) From 2e8401e33a75b00a84c75deae1e2900287fa8019 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 10 Jun 2021 03:32:57 -0400 Subject: [PATCH 65/85] docs update --- docs/source/user_guide/automl.ipynb | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/docs/source/user_guide/automl.ipynb b/docs/source/user_guide/automl.ipynb index d9fab25bdf..16d090e73c 100644 --- a/docs/source/user_guide/automl.ipynb +++ b/docs/source/user_guide/automl.ipynb @@ -220,7 +220,7 @@ "source": [ "### Using custom pipelines\n", "\n", - "EvalML's AutoML algorithm generates a set of pipelines to search with. To provide a custom set instead, set allowed_pipelines to a list of custom pipeline instances. Note: this will prevent AutoML from generating other pipelines to search over." + "EvalML's AutoML algorithm generates a set of pipelines to search with. To provide a custom set instead, set allowed_component_graphs to a list of custom component graphs. `AutoMLSearch` will use these to generate `Pipeline` instances. Note: this will prevent AutoML from generating other pipelines to search over." ] }, { @@ -235,7 +235,7 @@ "automl_custom = evalml.automl.AutoMLSearch(X_train=X_train,\n", " y_train=y_train,\n", " problem_type='multiclass',\n", - " allowed_pipelines=[MulticlassClassificationPipeline(component_graph=['Simple Imputer', 'Random Forest Classifier'])])" + " allowed_component_graphs=[\"My_pipeline\": ['Simple Imputer', 'Random Forest Classifier']])" ] }, { From 291e188daadd8f1453da5cbbafad87f7b652cf1d Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 10 Jun 2021 03:35:59 -0400 Subject: [PATCH 66/85] lint fixes --- evalml/automl/automl_search.py | 2 +- evalml/automl/utils.py | 2 +- evalml/tests/automl_tests/test_automl_utils.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index a8ff005451..4452ccbc73 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -20,8 +20,8 @@ check_all_pipeline_names_unique, get_best_sampler_for_data, get_default_primary_search_objective, - make_data_splitter, get_pipelines_from_component_graphs, + make_data_splitter, ) from evalml.data_checks import DefaultDataChecks from evalml.exceptions import ( diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index 8adf2f5e89..b36250cb68 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -6,8 +6,8 @@ from evalml.objectives import get_objective from evalml.pipelines import ( - ComponentGraph, BinaryClassificationPipeline, + ComponentGraph, MulticlassClassificationPipeline, RegressionPipeline, TimeSeriesBinaryClassificationPipeline, diff --git a/evalml/tests/automl_tests/test_automl_utils.py b/evalml/tests/automl_tests/test_automl_utils.py index 1019684eb1..5cffef7462 100644 --- a/evalml/tests/automl_tests/test_automl_utils.py +++ b/evalml/tests/automl_tests/test_automl_utils.py @@ -9,9 +9,9 @@ _LARGE_DATA_PERCENT_VALIDATION, _LARGE_DATA_ROW_THRESHOLD, get_best_sampler_for_data, - get_pipelines_from_component_graphs, get_default_primary_search_objective, get_hyperparameter_ranges, + get_pipelines_from_component_graphs, make_data_splitter, tune_binary_threshold, ) From aa4d8a50debd399842bb485e67d06c543103a13d Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 10 Jun 2021 10:16:14 -0400 Subject: [PATCH 67/85] test fixes --- evalml/automl/automl_search.py | 1 - evalml/objectives/standard_metrics.py | 1 - evalml/tests/automl_tests/test_automl.py | 58 ++++++++++++++---------- 3 files changed, 35 insertions(+), 25 deletions(-) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 4452ccbc73..0784ab5d90 100644 --- a/evalml/automl/automl_search.py +++ 
b/evalml/automl/automl_search.py @@ -494,7 +494,6 @@ def __init__( raise ValueError("No allowed pipelines to search") logger.info(f"{len(self.allowed_pipelines)} pipelines ready for search.") - check_all_pipeline_names_unique(self.allowed_pipelines) run_ensembling = self.ensembling text_in_ensembling = len(self.X_train.ww.select("natural_language").columns) > 0 diff --git a/evalml/objectives/standard_metrics.py b/evalml/objectives/standard_metrics.py index d10dabd320..6fbfbded7c 100644 --- a/evalml/objectives/standard_metrics.py +++ b/evalml/objectives/standard_metrics.py @@ -414,7 +414,6 @@ class R2(RegressionObjective): is_bounded_like_percentage = False # Range (-Inf, 1] def objective_function(self, y_true, y_predicted, X=None): - print(f"standard metrics - R2 - score: {metrics.r2_score(y_true, y_predicted)}") return metrics.r2_score(y_true, y_predicted) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index c8fcbd9ec2..eb1add009d 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -3726,15 +3726,12 @@ def test_automl_respects_pipeline_custom_hyperparameters_with_duplicate_componen custom_hyperparameters=custom_hyperparameters, max_batches=5, ) - from pprint import pp automl.search() for i, row in automl.full_rankings.iterrows(): if "Mode Baseline Binary" in row["pipeline_name"]: continue if row["pipeline_name"] == "Name_linear": - print("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") - pp(row.parameters) assert row["parameters"]["Imputer"]["numeric_impute_strategy"] == "mean" assert row["parameters"]["Imputer_1"]["numeric_impute_strategy"] in { "most_frequent", @@ -3930,15 +3927,17 @@ def test_automl_check_high_variance_logs_warning(mock_fit_binary, X_y_binary, ca assert "High coefficient of variation" in out -def test_automl_raises_error_with_duplicate_pipeline_names(X_y_binary): +def test_automl_raises_error_with_duplicate_pipeline_names(dummy_classifier_estimator_class, + X_y_binary): X, y = X_y_binary - component_graph_0 = {"Custom Pipeline": ["Imputer", "Random Forest Classifier"]} - component_graph_1 = { - "Custom Pipeline": ["Imputer", "Logistic Regression Classifier"] - } - component_graph_2 = {"My Pipeline 3": ["Logistic Regression Classifier"]} - component_graph_3 = {"My Pipeline 3": ["Random Forest Classifier"]} + class MyPipeline(BinaryClassificationPipeline): + estimator = dummy_classifier_estimator_class + + pipeline_0 = MyPipeline(custom_name="Custom Pipeline", component_graph=[dummy_classifier_estimator_class]) + pipeline_1 = MyPipeline(custom_name="Custom Pipeline", component_graph=[dummy_classifier_estimator_class]) + pipeline_2 = MyPipeline(custom_name="My Pipeline 3", component_graph=[dummy_classifier_estimator_class]) + pipeline_3 = MyPipeline(custom_name="My Pipeline 3", component_graph=[dummy_classifier_estimator_class]) with pytest.raises( ValueError, @@ -3948,12 +3947,11 @@ def test_automl_raises_error_with_duplicate_pipeline_names(X_y_binary): X, y, problem_type="binary", - allowed_component_graphs=[ - component_graph_0, - component_graph_1, - component_graph_2, - ], - ) + ).train_pipelines([ + pipeline_0, + pipeline_1, + pipeline_2, + ],) with pytest.raises( ValueError, @@ -3963,13 +3961,27 @@ def test_automl_raises_error_with_duplicate_pipeline_names(X_y_binary): X, y, problem_type="binary", - allowed_component_graphs=[ - component_graph_0, - component_graph_1, - component_graph_2, - component_graph_3, - ], - ) + ).train_pipelines([ + pipeline_0, + pipeline_1, + 
pipeline_2, + pipeline_3, + ], ) + + with pytest.raises( + ValueError, + match="All pipeline names must be unique. The names 'Custom Pipeline', 'My Pipeline 3' were repeated.", + ): + AutoMLSearch( + X, + y, + problem_type="binary", + ).score_pipelines([ + pipeline_0, + pipeline_1, + pipeline_2, + pipeline_3, + ], pd.DataFrame(), pd.Series(), None) @patch("evalml.pipelines.BinaryClassificationPipeline.score") From 1bd99a76b8ab60326ae0c1dbb6c7e93005fa29b7 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 10 Jun 2021 10:18:47 -0400 Subject: [PATCH 68/85] lint fix --- evalml/tests/automl_tests/test_automl.py | 72 ++++++++++--------- evalml/tests/automl_tests/test_engine_base.py | 21 ++++-- 2 files changed, 57 insertions(+), 36 deletions(-) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index eb1add009d..8c02736661 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -3927,61 +3927,69 @@ def test_automl_check_high_variance_logs_warning(mock_fit_binary, X_y_binary, ca assert "High coefficient of variation" in out -def test_automl_raises_error_with_duplicate_pipeline_names(dummy_classifier_estimator_class, - X_y_binary): +def test_automl_raises_error_with_duplicate_pipeline_names( + dummy_classifier_estimator_class, X_y_binary +): X, y = X_y_binary class MyPipeline(BinaryClassificationPipeline): estimator = dummy_classifier_estimator_class - pipeline_0 = MyPipeline(custom_name="Custom Pipeline", component_graph=[dummy_classifier_estimator_class]) - pipeline_1 = MyPipeline(custom_name="Custom Pipeline", component_graph=[dummy_classifier_estimator_class]) - pipeline_2 = MyPipeline(custom_name="My Pipeline 3", component_graph=[dummy_classifier_estimator_class]) - pipeline_3 = MyPipeline(custom_name="My Pipeline 3", component_graph=[dummy_classifier_estimator_class]) + pipeline_0 = MyPipeline( + custom_name="Custom Pipeline", + component_graph=[dummy_classifier_estimator_class], + ) + pipeline_1 = MyPipeline( + custom_name="Custom Pipeline", + component_graph=[dummy_classifier_estimator_class], + ) + pipeline_2 = MyPipeline( + custom_name="My Pipeline 3", component_graph=[dummy_classifier_estimator_class] + ) + pipeline_3 = MyPipeline( + custom_name="My Pipeline 3", component_graph=[dummy_classifier_estimator_class] + ) with pytest.raises( ValueError, match="All pipeline names must be unique. The name 'Custom Pipeline' was repeated.", ): - AutoMLSearch( - X, - y, - problem_type="binary", - ).train_pipelines([ + AutoMLSearch(X, y, problem_type="binary",).train_pipelines( + [ pipeline_0, pipeline_1, pipeline_2, - ],) + ], + ) with pytest.raises( ValueError, match="All pipeline names must be unique. The names 'Custom Pipeline', 'My Pipeline 3' were repeated.", ): - AutoMLSearch( - X, - y, - problem_type="binary", - ).train_pipelines([ - pipeline_0, - pipeline_1, - pipeline_2, - pipeline_3, - ], ) + AutoMLSearch(X, y, problem_type="binary",).train_pipelines( + [ + pipeline_0, + pipeline_1, + pipeline_2, + pipeline_3, + ], + ) with pytest.raises( ValueError, match="All pipeline names must be unique. 
The names 'Custom Pipeline', 'My Pipeline 3' were repeated.", ): - AutoMLSearch( - X, - y, - problem_type="binary", - ).score_pipelines([ - pipeline_0, - pipeline_1, - pipeline_2, - pipeline_3, - ], pd.DataFrame(), pd.Series(), None) + AutoMLSearch(X, y, problem_type="binary",).score_pipelines( + [ + pipeline_0, + pipeline_1, + pipeline_2, + pipeline_3, + ], + pd.DataFrame(), + pd.Series(), + None, + ) @patch("evalml.pipelines.BinaryClassificationPipeline.score") diff --git a/evalml/tests/automl_tests/test_engine_base.py b/evalml/tests/automl_tests/test_engine_base.py index 53d3a7b6f9..f72153dc81 100644 --- a/evalml/tests/automl_tests/test_engine_base.py +++ b/evalml/tests/automl_tests/test_engine_base.py @@ -14,7 +14,11 @@ @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_train_and_score_pipelines( - mock_fit, mock_score, dummy_classifier_estimator_class, dummy_binary_pipeline_class, X_y_binary + mock_fit, + mock_score, + dummy_classifier_estimator_class, + dummy_binary_pipeline_class, + X_y_binary, ): X, y = X_y_binary mock_score.return_value = {"Log Loss Binary": 0.42} @@ -24,7 +28,9 @@ def test_train_and_score_pipelines( problem_type="binary", max_time=1, max_batches=1, - allowed_component_graphs=[{"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}], + allowed_component_graphs=[ + {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} + ], ) pipeline = dummy_binary_pipeline_class({}) evaluation_result = evaluate_pipeline( @@ -51,7 +57,12 @@ def test_train_and_score_pipelines( @patch("evalml.pipelines.BinaryClassificationPipeline.score") @patch("evalml.pipelines.BinaryClassificationPipeline.fit") def test_train_and_score_pipelines_error( - mock_fit, mock_score, dummy_classifier_estimator_class, dummy_binary_pipeline_class, X_y_binary, caplog + mock_fit, + mock_score, + dummy_classifier_estimator_class, + dummy_binary_pipeline_class, + X_y_binary, + caplog, ): X, y = X_y_binary mock_score.side_effect = Exception("yeet") @@ -61,7 +72,9 @@ def test_train_and_score_pipelines_error( problem_type="binary", max_time=1, max_batches=1, - allowed_component_graphs=[{"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}], + allowed_component_graphs=[ + {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} + ], ) pipeline = dummy_binary_pipeline_class({}) From 38745022bdefc352edcd1086fb8fd870c03f97bf Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 10 Jun 2021 11:37:47 -0400 Subject: [PATCH 69/85] test update --- evalml/tests/automl_tests/test_automl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 8c02736661..bd13f9256b 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -2196,7 +2196,7 @@ def test_time_series_regression_with_parameters(ts_data): X_train=X, y_train=y, problem_type="time series regression", - allowed_component_graphs=[{"Name_0": ["Imputer", "ARIMA Regressor"]}], + allowed_component_graphs=[{"Name_0": ["Imputer", "Linear Regressor"]}], objective="auto", problem_configuration=problem_configuration, max_batches=3, From 85c296e19cb34211de44d84e3f614cd90a118920 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 10 Jun 2021 12:21:06 -0400 Subject: [PATCH 70/85] docs update --- docs/source/user_guide/automl.ipynb | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/docs/source/user_guide/automl.ipynb b/docs/source/user_guide/automl.ipynb index 16d090e73c..071a5cbf59 100644 --- a/docs/source/user_guide/automl.ipynb +++ b/docs/source/user_guide/automl.ipynb @@ -235,7 +235,7 @@ "automl_custom = evalml.automl.AutoMLSearch(X_train=X_train,\n", " y_train=y_train,\n", " problem_type='multiclass',\n", - " allowed_component_graphs=[\"My_pipeline\": ['Simple Imputer', 'Random Forest Classifier']])" + " allowed_component_graphs={\"My_pipeline\": ['Simple Imputer', 'Random Forest Classifier']})" ] }, { From f4bd833d254b61179ff96a3acc0368f7997b3225 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 10 Jun 2021 14:10:25 -0400 Subject: [PATCH 71/85] conf test clean up --- evalml/tests/conftest.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index c760b8247e..1682909fa4 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -363,9 +363,6 @@ def __init__(self, a=1, b=0, random_seed=0): parameters={"a": a, "b": b}, component_obj=None, random_seed=random_seed ) - def fit(self, X, y): - return self - return MockEstimator @@ -385,9 +382,6 @@ def __init__(self, a=1, b=0, random_seed=0): parameters={"a": a, "b": b}, component_obj=None, random_seed=random_seed ) - def fit(self, X, y): - return self - return MockEstimator @@ -423,18 +417,6 @@ def dummy_regressor_linear_component_graph(dummy_regressor_estimator_class): return component_graph_linear -@pytest.fixture -def dummy_classifier_dict_component_graph(dummy_classifier_estimator_class): - component_graph_dict = { - "Name": { - "Imputer": ["Imputer"], - "Imputer_1": ["Imputer", "Imputer"], - "Random Forest Classifier": [dummy_classifier_estimator_class, "Imputer_1"], - } - } - return component_graph_dict - - @pytest.fixture def dummy_binary_dict_component_graph(dummy_binary_estimator_class): component_graph_dict = { From 5fd09c44cd8615866db8b8f9e49d808a65e4622c Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 10 Jun 2021 14:13:30 -0400 Subject: [PATCH 72/85] test update --- evalml/automl/automl_algorithm/iterative_algorithm.py | 6 ------ evalml/tests/automl_tests/test_automl.py | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 3b5fb09b87..f7a12021bb 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -257,12 +257,6 @@ def _transform_parameters(self, pipeline, proposed_parameters): component_parameters[param_name] = value if name in self._pipeline_params and self._batch_number == 0: for param_name, value in self._pipeline_params[name].items(): - if isinstance(value, (Integer, Real, Categorical)): - raise ValueError( - "Pipeline parameters should not contain skopt.Space variables, please pass them " - "to custom_hyperparameters instead!" 
- ) - else: component_parameters[param_name] = value # Inspects each component and adds the following parameters when needed if "n_jobs" in init_params: diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index bd13f9256b..723f8548f9 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -3741,7 +3741,7 @@ def test_automl_respects_pipeline_custom_hyperparameters_with_duplicate_componen 100, 125, } - if row["pipeline_name"] == "Pipe Line Linear": + if row["pipeline_name"] == "Name_dict": assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in { "most_frequent", "mean", From e0d54e59776869257dfbf0be618665984897b17e Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 10 Jun 2021 14:25:34 -0400 Subject: [PATCH 73/85] lint and doc fix --- docs/source/user_guide/automl.ipynb | 2 +- evalml/automl/automl_algorithm/iterative_algorithm.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/user_guide/automl.ipynb b/docs/source/user_guide/automl.ipynb index 071a5cbf59..9edfb497cf 100644 --- a/docs/source/user_guide/automl.ipynb +++ b/docs/source/user_guide/automl.ipynb @@ -235,7 +235,7 @@ "automl_custom = evalml.automl.AutoMLSearch(X_train=X_train,\n", " y_train=y_train,\n", " problem_type='multiclass',\n", - " allowed_component_graphs={\"My_pipeline\": ['Simple Imputer', 'Random Forest Classifier']})" + " allowed_component_graphs=[{\"My_pipeline\": ['Simple Imputer', 'Random Forest Classifier']}])" ] }, { diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index f7a12021bb..6a5df058e8 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -257,7 +257,7 @@ def _transform_parameters(self, pipeline, proposed_parameters): component_parameters[param_name] = value if name in self._pipeline_params and self._batch_number == 0: for param_name, value in self._pipeline_params[name].items(): - component_parameters[param_name] = value + component_parameters[param_name] = value # Inspects each component and adds the following parameters when needed if "n_jobs" in init_params: component_parameters["n_jobs"] = self.n_jobs From 5a3d5a331d7bde1b0e4c041bc4825e28d3f891a8 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 14 Jun 2021 17:59:57 +0100 Subject: [PATCH 74/85] test updates --- docs/source/user_guide/automl.ipynb | 4 +++- evalml/automl/automl_algorithm/iterative_algorithm.py | 10 +++------- evalml/automl/automl_search.py | 3 ++- evalml/tests/automl_tests/test_automl.py | 1 - 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/docs/source/user_guide/automl.ipynb b/docs/source/user_guide/automl.ipynb index 9edfb497cf..3dd1fd5bdc 100644 --- a/docs/source/user_guide/automl.ipynb +++ b/docs/source/user_guide/automl.ipynb @@ -235,7 +235,9 @@ "automl_custom = evalml.automl.AutoMLSearch(X_train=X_train,\n", " y_train=y_train,\n", " problem_type='multiclass',\n", - " allowed_component_graphs=[{\"My_pipeline\": ['Simple Imputer', 'Random Forest Classifier']}])" + " allowed_component_graphs=[{\"My_pipeline\": ['Simple Imputer', 'Random Forest Classifier'],\n", + " \"random_seed\": 42},\n", + " {\"My_other_pipeline\": ['One Hot Encoder', 'Random Forest Classifier']}])" ] }, { diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 6a5df058e8..c9ff43102b 
100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -133,7 +133,7 @@ def next_batch(self): if self._batch_number == 0: next_batch = [ pipeline.new( - parameters=self._combine_parameters(pipeline, {}), + parameters={**self._transform_parameters(pipeline, {})}, random_seed=self.random_seed, ) for pipeline in self.allowed_pipelines @@ -149,7 +149,7 @@ def next_batch(self): for pipeline_dict in self._best_pipeline_info.values(): pipeline = pipeline_dict["pipeline"] pipeline_params = pipeline_dict["parameters"] - parameters = self._combine_parameters(pipeline, pipeline_params) + parameters = {**self._transform_parameters(pipeline, pipeline_params)} input_pipelines.append( pipeline.new(parameters=parameters, random_seed=self.random_seed) ) @@ -172,7 +172,7 @@ def next_batch(self): pipeline = self._first_batch_results[idx][1] for i in range(self.pipelines_per_batch): proposed_parameters = self._tuners[pipeline.name].propose() - parameters = self._combine_parameters(pipeline, proposed_parameters) + parameters = {**self._transform_parameters(pipeline, proposed_parameters)} next_batch.append( pipeline.new(parameters=parameters, random_seed=self.random_seed) ) @@ -180,10 +180,6 @@ def next_batch(self): self._batch_number += 1 return next_batch - def _combine_parameters(self, pipeline, proposed_parameters): - """Helper function for logic to transform proposed parameters.""" - return {**self._transform_parameters(pipeline, proposed_parameters)} - def add_result(self, score_to_minimize, pipeline, trained_pipeline_results): """Register results from evaluating a pipeline diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 0784ab5d90..e00e40ed38 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -189,7 +189,8 @@ def __init__( Only applicable if patience is not None. Defaults to None. allowed_component_graphs (list): A list of dictionaries indicating the component graphs allowed in the search. - The format should follow [ {Name_of_graph: [list_of_components]} ]. + The format should follow [ {Name_of_graph: [list_of_components], + random_seed: 42} ]. The default of None indicates all pipeline component graphs for this problem type are allowed. Setting this field will cause allowed_model_families to be ignored. 
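
To make the list-of-dicts format documented in the docstring above concrete, a search call at this point in the series might look like the sketch below. The graph names and the training data are illustrative placeholders rather than anything taken from the patches, and later patches in this series (PATCH 79/85 onward) replace the list with a single dictionary keyed by pipeline name.

# Sketch of the intermediate allowed_component_graphs API described above:
# a list of single-entry dicts, each mapping one graph name to a component list,
# with an optional per-graph "random_seed" key.
import numpy as np
from evalml import AutoMLSearch

X_train = np.random.rand(100, 5)          # placeholder feature matrix (assumption)
y_train = np.random.randint(0, 2, 100)    # placeholder binary target (assumption)

automl = AutoMLSearch(
    X_train=X_train,
    y_train=y_train,
    problem_type="binary",
    allowed_component_graphs=[
        {"Simple Linear": ["Imputer", "One Hot Encoder", "Random Forest Classifier"]},
        {"Seeded Linear": ["Imputer", "Logistic Regression Classifier"], "random_seed": 42},
    ],
    max_batches=1,
)
automl.search()
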
diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 723f8548f9..0184324460 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -2201,7 +2201,6 @@ def test_time_series_regression_with_parameters(ts_data): problem_configuration=problem_configuration, max_batches=3, ) - automl.search() assert automl.allowed_pipelines[0].parameters["pipeline"] == problem_configuration From a2aa4b07c591e527141284c6fa6d68efaac07dc1 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 14 Jun 2021 18:57:15 +0100 Subject: [PATCH 75/85] add component graph check --- .../automl_algorithm/iterative_algorithm.py | 4 ++- evalml/automl/automl_search.py | 7 ++-- evalml/tests/automl_tests/test_automl.py | 34 ++++++++++++++++++- 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index c9ff43102b..1fea044170 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -172,7 +172,9 @@ def next_batch(self): pipeline = self._first_batch_results[idx][1] for i in range(self.pipelines_per_batch): proposed_parameters = self._tuners[pipeline.name].propose() - parameters = {**self._transform_parameters(pipeline, proposed_parameters)} + parameters = { + **self._transform_parameters(pipeline, proposed_parameters) + } next_batch.append( pipeline.new(parameters=parameters, random_seed=self.random_seed) ) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index e00e40ed38..99fa677d06 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -36,6 +36,7 @@ get_objective, ) from evalml.pipelines import ( + ComponentGraph, BinaryClassificationPipeline, MulticlassClassificationPipeline, RegressionPipeline, @@ -188,7 +189,7 @@ def __init__( tolerance (float): Minimum percentage difference to qualify as score improvement for early stopping. Only applicable if patience is not None. Defaults to None. - allowed_component_graphs (list): A list of dictionaries indicating the component graphs allowed in the search. + allowed_component_graphs (list): A list of dictionaries or ComponentGraphs indicating the component graphs allowed in the search. The format should follow [ {Name_of_graph: [list_of_components], random_seed: 42} ]. The default of None indicates all pipeline component graphs for this problem type are allowed. Setting this field will cause @@ -388,9 +389,9 @@ def __init__( "Parameter allowed_component_graphs must be either None or a list!" ) for graph in allowed_component_graphs: - if not isinstance(graph, dict): + if not isinstance(graph, (dict, ComponentGraph)): raise ValueError( - "Every component graph passed must be of type dictionary!" + "Every component graph passed must be of type dictionary or ComponentGraph!" 
) unique_names = set() for graph in allowed_component_graphs: diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 0184324460..f132941658 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -47,6 +47,7 @@ get_objective, ) from evalml.pipelines import ( + ComponentGraph, BinaryClassificationPipeline, Estimator, MulticlassClassificationPipeline, @@ -2204,6 +2205,37 @@ def test_time_series_regression_with_parameters(ts_data): assert automl.allowed_pipelines[0].parameters["pipeline"] == problem_configuration +@pytest.mark.parametrize("graph_type", ["linear", "dict"]) +def test_automl_accepts_component_graphs( + graph_type, dummy_classifier_estimator_class, X_y_binary +): + X, y = X_y_binary + if graph_type == "linear": + component_graph = ComponentGraph().from_list( + ["Imputer", "Logistic Regression Classifier"] + ) + else: + component_dict = { + "imputer": ["Imputer"], + "ohe": ["One Hot Encoder", "imputer.x"], + "estimator_1": ["Random Forest Classifier", "ohe.x"], + "estimator_2": ["Decision Tree Classifier", "ohe.x"], + "final": ["Logistic Regression Classifier", "estimator_1", "estimator_2"], + } + component_graph = ComponentGraph(component_dict) + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + allowed_component_graphs=[{"Dummy_Name": component_graph}], + objective="auto", + max_batches=3, + ) + for pipeline_ in automl.allowed_pipelines: + assert isinstance(pipeline_, BinaryClassificationPipeline) + assert pipeline_.component_graph == component_graph + + @pytest.mark.parametrize("fold_scores", [[2, 4, 6], [np.nan, 4, 6]]) @patch( "evalml.pipelines.BinaryClassificationPipeline.score", @@ -4489,7 +4521,7 @@ def test_automl_validates_data_passed_in_to_allowed_component_graphs( with pytest.raises( ValueError, - match="Every component graph passed must be of type dictionary!", + match="Every component graph passed must be of type dictionary or ComponentGraph!", ): AutoMLSearch( X, From 9cef3a5b95eda6731e66e2f72278a5b3f32814a7 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 14 Jun 2021 19:48:06 +0100 Subject: [PATCH 76/85] lint fix --- evalml/automl/automl_search.py | 2 +- evalml/tests/automl_tests/test_automl.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 99fa677d06..21239fcd6b 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -36,8 +36,8 @@ get_objective, ) from evalml.pipelines import ( - ComponentGraph, BinaryClassificationPipeline, + ComponentGraph, MulticlassClassificationPipeline, RegressionPipeline, TimeSeriesBinaryClassificationPipeline, diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index f132941658..39d7e79288 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -34,9 +34,9 @@ ) from evalml.model_family import ModelFamily from evalml.objectives import ( - F1, BinaryClassificationObjective, CostBenefitMatrix, + F1, FraudCost, RegressionObjective, ) From a4450a19e00bb73904711d4944813eaad2fcdde9 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 14 Jun 2021 19:53:47 +0100 Subject: [PATCH 77/85] lint fix --- evalml/tests/automl_tests/test_automl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 
39d7e79288..8c93239cd8 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -13,7 +13,9 @@ from skopt.space import Categorical, Integer, Real from evalml import AutoMLSearch -from evalml.automl.automl_algorithm import IterativeAlgorithm +from evalml.automl.automl_algorithm import ( + IterativeAlgorithm +) from evalml.automl.callbacks import ( log_error_callback, raise_error_callback, From ab05f3f22d30915a1d2121bde9cf91c55a9f554a Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 14 Jun 2021 19:59:26 +0100 Subject: [PATCH 78/85] lint fix --- evalml/tests/automl_tests/test_automl.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 8c93239cd8..e0fdce480b 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -13,9 +13,7 @@ from skopt.space import Categorical, Integer, Real from evalml import AutoMLSearch -from evalml.automl.automl_algorithm import ( - IterativeAlgorithm -) +from evalml.automl.automl_algorithm import IterativeAlgorithm from evalml.automl.callbacks import ( log_error_callback, raise_error_callback, @@ -36,9 +34,9 @@ ) from evalml.model_family import ModelFamily from evalml.objectives import ( + F1, BinaryClassificationObjective, CostBenefitMatrix, - F1, FraudCost, RegressionObjective, ) @@ -49,8 +47,8 @@ get_objective, ) from evalml.pipelines import ( - ComponentGraph, BinaryClassificationPipeline, + ComponentGraph, Estimator, MulticlassClassificationPipeline, PipelineBase, From 969dffadf08a83871da2218e641b372d6d32acbd Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 17 Jun 2021 16:34:42 +0100 Subject: [PATCH 79/85] Update allowed_component_graphs format, update tests --- .../automl_algorithm/iterative_algorithm.py | 8 +- evalml/automl/automl_search.py | 28 +-- evalml/automl/utils.py | 18 +- evalml/tests/automl_tests/test_automl.py | 192 ++++++++---------- .../test_automl_search_classification.py | 14 +- evalml/tests/conftest.py | 32 +-- 6 files changed, 109 insertions(+), 183 deletions(-) diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 1fea044170..0727d18644 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -133,7 +133,7 @@ def next_batch(self): if self._batch_number == 0: next_batch = [ pipeline.new( - parameters={**self._transform_parameters(pipeline, {})}, + parameters=self._transform_parameters(pipeline, {}), random_seed=self.random_seed, ) for pipeline in self.allowed_pipelines @@ -149,7 +149,7 @@ def next_batch(self): for pipeline_dict in self._best_pipeline_info.values(): pipeline = pipeline_dict["pipeline"] pipeline_params = pipeline_dict["parameters"] - parameters = {**self._transform_parameters(pipeline, pipeline_params)} + parameters = self._transform_parameters(pipeline, pipeline_params) input_pipelines.append( pipeline.new(parameters=parameters, random_seed=self.random_seed) ) @@ -172,9 +172,7 @@ def next_batch(self): pipeline = self._first_batch_results[idx][1] for i in range(self.pipelines_per_batch): proposed_parameters = self._tuners[pipeline.name].propose() - parameters = { - **self._transform_parameters(pipeline, proposed_parameters) - } + parameters = self._transform_parameters(pipeline, proposed_parameters) next_batch.append( pipeline.new(parameters=parameters, 
random_seed=self.random_seed) ) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 332cfa5e7c..d3b3110510 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -191,13 +191,14 @@ def __init__( tolerance (float): Minimum percentage difference to qualify as score improvement for early stopping. Only applicable if patience is not None. Defaults to None. - allowed_component_graphs (list): A list of dictionaries or ComponentGraphs indicating the component graphs allowed in the search. - The format should follow [ {Name_of_graph: [list_of_components], - random_seed: 42} ]. + allowed_component_graphs (dict): A dictionary of lists or ComponentGraphs indicating the component graphs allowed in the search. + The format should follow { "Name_0": [list_of_components], + "Name_1": [ComponentGraph(...)] } + The default of None indicates all pipeline component graphs for this problem type are allowed. Setting this field will cause allowed_model_families to be ignored. - e.g. allowed_component_graphs = [ {"Name": ["Imputer", "One Hot Encoder", dummy_classifier_estimator_class]} ] + e.g. allowed_component_graphs = { "My_Graph": ["Imputer", "One Hot Encoder", "Random Forest Classifier"] } allowed_model_families (list(str, ModelFamily)): The model families to search. The default of None searches over all model families. Run evalml.pipelines.components.utils.allowed_model_families("binary") to see options. Change `binary` @@ -405,22 +406,15 @@ def __init__( "Unable to import plotly; skipping pipeline search plotting\n" ) if allowed_component_graphs is not None: - if not isinstance(allowed_component_graphs, list): + if not isinstance(allowed_component_graphs, dict): raise ValueError( - "Parameter allowed_component_graphs must be either None or a list!" + "Parameter allowed_component_graphs must be either None or a dictionary!" ) - for graph in allowed_component_graphs: - if not isinstance(graph, (dict, ComponentGraph)): + for graph_name, graph in allowed_component_graphs.items(): + if not isinstance(graph, (list, dict, ComponentGraph)): raise ValueError( - "Every component graph passed must be of type dictionary or ComponentGraph!" + "Every component graph passed must be of type list, dictionary, or ComponentGraph!" ) - unique_names = set() - for graph in allowed_component_graphs: - unique_names.add(list(graph.keys())[0]) - if len(unique_names) < len(allowed_component_graphs): - raise ValueError( - "Every name of allowed_component_graphs must be unique!" - ) self.allowed_component_graphs = allowed_component_graphs self.allowed_model_families = allowed_model_families self._automl_algorithm = None @@ -510,7 +504,7 @@ def __init__( ] else: self.allowed_pipelines = get_pipelines_from_component_graphs( - self.allowed_component_graphs, self.problem_type, parameters + self.allowed_component_graphs, self.problem_type, parameters, self.random_seed ) if self.allowed_pipelines == []: diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index ad191ea52f..1dc769c095 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -231,15 +231,16 @@ def get_hyperparameter_ranges(component_graph, custom_hyperparameters): def get_pipelines_from_component_graphs( - component_graphs_list, problem_type, parameters=None + component_graphs_dict, problem_type, parameters=None, random_seed=0 ): """ Returns created pipelines from passed component graphs based on the specified problem type. Arguments: - component_graphs_list (list): The list of component graphs. 
+ component_graphs_dict (dict): The dict of component graphs. problem_type (str or ProblemType): The problem type for which pipelines will be created. parameters (dict or None): Pipeline-level parameters that should be passed to the proposed pipelines. + random_seed (int): Random seed. Returns: list: List of pipelines made from the passed component graphs. @@ -253,18 +254,13 @@ def get_pipelines_from_component_graphs( ProblemTypes.TIME_SERIES_REGRESSION: TimeSeriesRegressionPipeline, }[handle_problem_types(problem_type)] created_pipelines = [] - for component_graph in component_graphs_list: - comp_seed = 0 - if "random_seed" in component_graph.keys(): - comp_seed = component_graph.pop("random_seed") - comp_name = next(iter(component_graph)) - comp_graph = component_graph[comp_name] + for graph_name, component_graph in component_graphs_dict.items(): created_pipelines.append( pipeline_class( - component_graph=comp_graph, + component_graph=component_graph, parameters=parameters, - custom_name=comp_name, - random_seed=comp_seed, + custom_name=graph_name, + random_seed=random_seed, ) ) return created_pipelines diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 1af8575413..24d9b0d13d 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -556,14 +556,13 @@ def test_automl_feature_selection( problem_type="binary", max_iterations=2, start_iteration_callback=start_iteration_callback, - allowed_component_graphs=[ + allowed_component_graphs= { "Name": [ "RF Classifier Select From Model", "Logistic Regression Classifier", ] }, - ], ) automl.search() @@ -644,9 +643,7 @@ def test_automl_allowed_component_graphs_algorithm( mock_algo_init.side_effect = Exception("mock algo init") X, y = X_y_binary - allowed_component_graphs = [ - {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} - ] + allowed_component_graphs = {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} with pytest.raises(Exception, match="mock algo init"): AutoMLSearch( X_train=X, @@ -966,33 +963,12 @@ def test_component_graph_with_incorrect_problem_type( X_train=X, y_train=y, problem_type="regression", - allowed_component_graphs=[ + allowed_component_graphs= { "Mock Binary Classification Pipeline": [ dummy_classifier_estimator_class ] - } - ], - ) - - -def test_component_graph_with_nonunique_names( - X_y_binary, dummy_classifier_estimator_class -): - X, y = X_y_binary - - with pytest.raises( - ValueError, match="Every name of allowed_component_graphs must be unique!" 
- ): - AutoMLSearch( - X_train=X, - y_train=y, - problem_type="binary", - allowed_component_graphs=[ - {"Name_0": [dummy_classifier_estimator_class]}, - {"Name_1": [dummy_classifier_estimator_class]}, - {"Name_0": [dummy_classifier_estimator_class]}, - ], + }, ) @@ -1119,7 +1095,7 @@ def test_add_to_rankings( y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs=[dummy_classifier_linear_component_graph], + allowed_component_graphs=dummy_classifier_linear_component_graph, ) automl.search() assert len(automl.rankings) == 1 @@ -1176,7 +1152,7 @@ def test_add_to_rankings_no_search( y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs=[dummy_classifier_linear_component_graph], + allowed_component_graphs=dummy_classifier_linear_component_graph, ) mock_score.return_value = {"Log Loss Binary": 0.5234} @@ -1211,7 +1187,7 @@ def test_add_to_rankings_regression_large( automl = AutoMLSearch( X_train=X, y_train=y, - allowed_component_graphs=[dummy_regressor_linear_component_graph], + allowed_component_graphs=dummy_regressor_linear_component_graph, problem_type="regression", max_time=1, max_iterations=1, @@ -1254,7 +1230,7 @@ def test_add_to_rankings_regression( automl = AutoMLSearch( X_train=X, y_train=y, - allowed_component_graphs=[dummy_regressor_linear_component_graph], + allowed_component_graphs=dummy_regressor_linear_component_graph, problem_type="regression", max_time=1, max_iterations=1, @@ -1286,7 +1262,7 @@ def test_add_to_rankings_duplicate( problem_type="binary", optimize_thresholds=False, max_iterations=1, - allowed_component_graphs=[dummy_classifier_linear_component_graph], + allowed_component_graphs=dummy_classifier_linear_component_graph, ) automl.search() best_pipeline = automl.best_pipeline @@ -1309,10 +1285,10 @@ def test_add_to_rankings_duplicate( def test_add_to_rankings_trained( mock_fit, mock_score, - dummy_classifier_estimator_class, mock_predict_proba, mock_encode, mock_threshold, + dummy_classifier_estimator_class, dummy_binary_pipeline_class, X_y_binary, ): @@ -1329,10 +1305,10 @@ class CoolBinaryClassificationPipeline(dummy_binary_pipeline_class): y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs=[ - {"Cool Binary Classification Pipeline": [dummy_classifier_estimator_class]}, - {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}, - ], + allowed_component_graphs={ + "Cool Binary Classification Pipeline": [dummy_classifier_estimator_class], + "Mock Binary Classification Pipeline": [dummy_classifier_estimator_class], + }, ) automl.search() assert len(automl.rankings) == 1 @@ -1900,7 +1876,7 @@ def test_pipelines_in_batch_return_nan( y_train=y, problem_type="binary", max_batches=3, - allowed_component_graphs=[{"Name": [dummy_classifier_estimator_class]}], + allowed_component_graphs={"Name": [dummy_classifier_estimator_class]}, n_jobs=1, ) with pytest.raises( @@ -1944,7 +1920,7 @@ def test_pipelines_in_batch_return_none( y_train=y, problem_type="binary", max_batches=3, - allowed_component_graphs=[{"Name": [dummy_classifier_estimator_class]}], + allowed_component_graphs={"Name": [dummy_classifier_estimator_class]}, n_jobs=1, ) with pytest.raises( @@ -2328,7 +2304,7 @@ def test_time_series_regression_with_parameters(ts_data): X_train=X, y_train=y, problem_type="time series regression", - allowed_component_graphs=[{"Name_0": ["Imputer", "Linear Regressor"]}], + allowed_component_graphs={"Name_0": ["Imputer", "Linear Regressor"]}, objective="auto", 
problem_configuration=problem_configuration, max_batches=3, @@ -2336,15 +2312,23 @@ def test_time_series_regression_with_parameters(ts_data): assert automl.allowed_pipelines[0].parameters["pipeline"] == problem_configuration -@pytest.mark.parametrize("graph_type", ["linear", "dict"]) +@pytest.mark.parametrize("graph_type", ["linear", "dict", "cg"]) def test_automl_accepts_component_graphs( graph_type, dummy_classifier_estimator_class, X_y_binary ): X, y = X_y_binary if graph_type == "linear": - component_graph = ComponentGraph().from_list( - ["Imputer", "Logistic Regression Classifier"] - ) + component_graph = ["Imputer", "Logistic Regression Classifier"] + component_graph_obj = ComponentGraph.from_list(component_graph) + elif graph_type == "dict": + component_graph = { + "imputer": ["Imputer"], + "ohe": ["One Hot Encoder", "imputer.x"], + "estimator_1": ["Random Forest Classifier", "ohe.x"], + "estimator_2": ["Decision Tree Classifier", "ohe.x"], + "final": ["Logistic Regression Classifier", "estimator_1", "estimator_2"], + } + component_graph_obj = ComponentGraph(component_graph) else: component_dict = { "imputer": ["Imputer"], @@ -2354,17 +2338,18 @@ def test_automl_accepts_component_graphs( "final": ["Logistic Regression Classifier", "estimator_1", "estimator_2"], } component_graph = ComponentGraph(component_dict) + component_graph_obj = component_graph automl = AutoMLSearch( X_train=X, y_train=y, problem_type="binary", - allowed_component_graphs=[{"Dummy_Name": component_graph}], + allowed_component_graphs={"Dummy_Name": component_graph}, objective="auto", max_batches=3, ) for pipeline_ in automl.allowed_pipelines: assert isinstance(pipeline_, BinaryClassificationPipeline) - assert pipeline_.component_graph == component_graph + assert pipeline_.component_graph == component_graph_obj @pytest.mark.parametrize("fold_scores", [[2, 4, 6], [np.nan, 4, 6]]) @@ -2687,7 +2672,7 @@ def test_automl_one_allowed_component_graph_ensembling_disabled( y_train=y, problem_type="binary", max_iterations=max_iterations, - allowed_component_graphs=[allowed_component_graph], + allowed_component_graphs=allowed_component_graph, optimize_thresholds=False, ensembling=True, ) @@ -2962,9 +2947,10 @@ def test_iterative_algorithm_pipeline_hyperparameters_make_pipeline_other_errors } estimators = get_estimators("multiclass", [ModelFamily.EXTRA_TREES]) - component_graphs = [ - {f"CG_{ind}": [estimator]} for ind, estimator in enumerate(estimators) - ] + component_graphs = {} + for ind, estimator in enumerate(estimators): + component_graphs[f"CG_{ind}"] = [estimator] + automl = AutoMLSearch( X_train=X, y_train=y, @@ -3005,15 +2991,13 @@ def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline( custom_hyperparameters_ = None if component_graphs: - component_graph_ = [ - { + component_graph_ = { "Name_0": [ "Drop Columns Transformer", "Imputer", "Random Forest Classifier", ] } - ] if automl_parameters: automl_parameters_ = { @@ -3126,11 +3110,11 @@ def __init__(self, n_jobs=-1, random_seed=0): problem_type="binary", n_jobs=3, max_batches=2, - allowed_component_graphs=[ - {"Pipeline 1": [MockEstimatorWithNJobs]}, - {"Pipeline 2": [MockEstimatorWithNJobs]}, - {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}, - ], + allowed_component_graphs={ + "Pipeline 1": [MockEstimatorWithNJobs], + "Pipeline 2": [MockEstimatorWithNJobs], + "Mock Binary Classification Pipeline": [dummy_classifier_estimator_class], + }, optimize_thresholds=False, ) automl.search() @@ -3337,15 +3321,13 @@ def 
test_automl_respects_random_seed( X_train=X, y_train=y, problem_type="binary", - allowed_component_graphs=[ - {"Name_0": [dummy_classifier_estimator_class], "random_seed": 0} - ], + allowed_component_graphs={"Name_0": [dummy_classifier_estimator_class]}, optimize_thresholds=False, random_seed=42, max_iterations=10, ) automl.search() - assert automl.allowed_pipelines[0].random_seed == 0 + assert automl.allowed_pipelines[0].random_seed == 42 @pytest.mark.parametrize( @@ -3844,10 +3826,10 @@ def test_automl_respects_pipeline_parameters_with_duplicate_components( X, y, problem_type="binary", - allowed_component_graphs=[ - {"Pipeline from dict": component_graph_dict}, - {"Pipeline from linear": component_graph_linear}, - ], + allowed_component_graphs={ + "Pipeline from dict": component_graph_dict, + "Pipeline from linear": component_graph_linear, + }, pipeline_parameters={ "Imputer": {"numeric_impute_strategy": "most_frequent"}, "Imputer_1": {"numeric_impute_strategy": "median"}, @@ -3875,10 +3857,10 @@ def test_automl_respects_pipeline_parameters_with_duplicate_components( X, y, problem_type="binary", - allowed_component_graphs=[ - {"Pipeline from dict": component_graph_dict}, - {"Pipeline from linear": component_graph_linear}, - ], + allowed_component_graphs={ + "Pipeline from dict": component_graph_dict, + "Pipeline from linear": component_graph_linear, + }, pipeline_parameters={ "One Hot Encoder": {"top_n": 15}, "One Hot Encoder_1": {"top_n": 25}, @@ -3934,7 +3916,7 @@ def test_automl_respects_pipeline_custom_hyperparameters_with_duplicate_componen X, y, problem_type="binary", - allowed_component_graphs=[component_graph], + allowed_component_graphs=component_graph, custom_hyperparameters=custom_hyperparameters, optimize_thresholds=False, max_batches=5, @@ -3979,40 +3961,36 @@ def test_automl_adds_pipeline_parameters_to_custom_pipeline_hyperparams( ): X, y = X_y_binary - component_graph_0 = { - "Pipe Line One": { + component_graph_1 = { "Imputer": ["Imputer"], "Imputer_1": ["Imputer", "Imputer"], "One Hot Encoder": ["One Hot Encoder", "Imputer_1"], "Random Forest Classifier": ["Random Forest Classifier", "One Hot Encoder"], } - } - component_graph_1 = { - "Pipe Line Two": [ - "Imputer", - "Imputer", - "One Hot Encoder", - "Random Forest Classifier", - ] - } - component_graph_2 = { - "Pipe Line Three": [ + + component_graph_2 = [ "Imputer", "Imputer", "One Hot Encoder", "Random Forest Classifier", ] - } + + component_graph_3 = [ + "Imputer", + "Imputer", + "One Hot Encoder", + "Random Forest Classifier", + ] automl = AutoMLSearch( X, y, problem_type="binary", - allowed_component_graphs=[ - component_graph_0, - component_graph_1, - component_graph_2, - ], + allowed_component_graphs={ + "Pipe Line One": component_graph_1, + "Pipe Line Two": component_graph_2, + "Pipe Line Three": component_graph_3, + }, pipeline_parameters={"Imputer": {"numeric_impute_strategy": "most_frequent"}}, custom_hyperparameters={ "One Hot Encoder": {"top_n": Categorical([12, 10])}, @@ -4456,9 +4434,7 @@ def test_score_batch_works( y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs=[ - {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} - ], + allowed_component_graphs={"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}, optimize_thresholds=False, ) @@ -4524,9 +4500,7 @@ class Pipeline2(dummy_binary_pipeline_class): y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs=[ - {"Mock Binary Classification Pipeline": 
[dummy_classifier_estimator_class]} - ], + allowed_component_graphs={"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}, ) with pytest.raises( @@ -4552,9 +4526,7 @@ def test_score_batch_before_fitting_yields_error_nan_scores( y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs=[ - {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} - ], + allowed_component_graphs= {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}, ) scored_pipelines = automl.score_pipelines( @@ -4714,35 +4686,31 @@ def test_automl_validates_data_passed_in_to_allowed_component_graphs( with pytest.raises( ValueError, - match="Parameter allowed_component_graphs must be either None or a list!", + match="Parameter allowed_component_graphs must be either None or a dictionary!", ): AutoMLSearch( X, y, problem_type="binary", - allowed_component_graphs={ - "Mock Binary Classification Pipeline": [ + allowed_component_graphs=[ + {"Mock Binary Classification Pipeline": [ dummy_classifier_estimator_class - ] - }, + ]} + ], ) with pytest.raises( ValueError, - match="Every component graph passed must be of type dictionary or ComponentGraph!", + match="Every component graph passed must be of type list, dictionary, or ComponentGraph!", ): AutoMLSearch( X, y, problem_type="binary", - allowed_component_graphs=[ - "Mock Binary Classification Pipeline", + allowed_component_graphs= { - "Mock Binary Classification Pipeline": [ - dummy_classifier_estimator_class - ] + "Mock Binary Classification Pipeline": dummy_classifier_estimator_class }, - ], ) diff --git a/evalml/tests/automl_tests/test_automl_search_classification.py b/evalml/tests/automl_tests/test_automl_search_classification.py index a26ca0e780..68583baedd 100644 --- a/evalml/tests/automl_tests/test_automl_search_classification.py +++ b/evalml/tests/automl_tests/test_automl_search_classification.py @@ -1045,10 +1045,8 @@ def test_automl_component_graphs_search( mock_multi_score, is_linear, problem_type, - dummy_binary_linear_component_graph, - dummy_binary_dict_component_graph, - dummy_multiclass_linear_component_graph, - dummy_multiclass_dict_component_graph, + dummy_classifier_linear_component_graph, + dummy_classifier_dict_component_graph, X_y_binary, X_y_multi, ): @@ -1057,18 +1055,18 @@ def test_automl_component_graphs_search( mock_binary_score.return_value = {"Log Loss Binary": 1.0} expected_mock_class = BinaryClassificationPipeline component_graph = ( - dummy_binary_linear_component_graph + dummy_classifier_linear_component_graph if is_linear - else dummy_binary_dict_component_graph + else dummy_classifier_dict_component_graph ) else: X, y = X_y_multi mock_multi_score.return_value = {"Log Loss Multiclass": 1.0} expected_mock_class = MulticlassClassificationPipeline component_graph = ( - dummy_multiclass_linear_component_graph + dummy_classifier_linear_component_graph if is_linear - else dummy_multiclass_dict_component_graph + else dummy_classifier_dict_component_graph ) start_iteration_callback = MagicMock() diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 52a6951c5f..00bb865757 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -393,22 +393,6 @@ def dummy_classifier_linear_component_graph(dummy_classifier_estimator_class): return component_graph_linear -@pytest.fixture -def dummy_binary_linear_component_graph(dummy_binary_estimator_class): - component_graph_linear = { - "Name": ["Imputer", "One Hot Encoder", dummy_binary_estimator_class] 
- } - return component_graph_linear - - -@pytest.fixture -def dummy_multiclass_linear_component_graph(dummy_multiclass_estimator_class): - component_graph_linear = { - "Name": ["Imputer", "One Hot Encoder", dummy_multiclass_estimator_class] - } - return component_graph_linear - - @pytest.fixture def dummy_regressor_linear_component_graph(dummy_regressor_estimator_class): component_graph_linear = { @@ -418,24 +402,12 @@ def dummy_regressor_linear_component_graph(dummy_regressor_estimator_class): @pytest.fixture -def dummy_binary_dict_component_graph(dummy_binary_estimator_class): - component_graph_dict = { - "Name": { - "Imputer": ["Imputer"], - "Imputer_1": ["Imputer", "Imputer"], - "Random Forest Classifier": [dummy_binary_estimator_class, "Imputer_1"], - } - } - return component_graph_dict - - -@pytest.fixture -def dummy_multiclass_dict_component_graph(dummy_multiclass_estimator_class): +def dummy_classifier_dict_component_graph(dummy_classifier_estimator_class): component_graph_dict = { "Name": { "Imputer": ["Imputer"], "Imputer_1": ["Imputer", "Imputer"], - "Random Forest Classifier": [dummy_multiclass_estimator_class, "Imputer_1"], + "Random Forest Classifier": [dummy_classifier_estimator_class, "Imputer_1"], } } return component_graph_dict From 2e18c65b70b3368282d48ca3f0c73e4cdcaf4178 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 17 Jun 2021 16:57:15 +0100 Subject: [PATCH 80/85] lint updates --- evalml/automl/automl_search.py | 5 +- .../dask_tests/test_automl_dask.py | 30 +------ evalml/tests/automl_tests/test_automl.py | 87 ++++++++++--------- 3 files changed, 55 insertions(+), 67 deletions(-) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index d3b3110510..258d57f8f5 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -504,7 +504,10 @@ def __init__( ] else: self.allowed_pipelines = get_pipelines_from_component_graphs( - self.allowed_component_graphs, self.problem_type, parameters, self.random_seed + self.allowed_component_graphs, + self.problem_type, + parameters, + self.random_seed, ) if self.allowed_pipelines == []: diff --git a/evalml/tests/automl_tests/dask_tests/test_automl_dask.py b/evalml/tests/automl_tests/dask_tests/test_automl_dask.py index 0d43a5a60c..63531b2fb9 100644 --- a/evalml/tests/automl_tests/dask_tests/test_automl_dask.py +++ b/evalml/tests/automl_tests/dask_tests/test_automl_dask.py @@ -110,19 +110,8 @@ def test_automl_train_dask_error_callback(X_y_binary_cls, cluster, caplog): engine=parallel_engine, max_iterations=2, ) - automl._automl_algorithm = IterativeAlgorithm( - max_iterations=2, - allowed_pipelines=pipelines, - tuner_class=SKOptTuner, - random_seed=0, - n_jobs=-1, - number_features=X.shape[1], - pipelines_per_batch=5, - ensembling=False, - text_in_ensembling=False, - pipeline_params={}, - custom_hyperparameters=None, - ) + automl.allowed_pipelines = pipelines + automl.train_pipelines(pipelines) assert "Train error for PipelineWithError: Yikes" in caplog.text @@ -142,19 +131,8 @@ def test_automl_score_dask_error_callback(X_y_binary_cls, cluster, caplog): engine=parallel_engine, max_iterations=2, ) - automl._automl_algorithm = IterativeAlgorithm( - max_iterations=2, - allowed_pipelines=pipelines, - tuner_class=SKOptTuner, - random_seed=0, - n_jobs=-1, - number_features=X.shape[1], - pipelines_per_batch=5, - ensembling=False, - text_in_ensembling=False, - pipeline_params={}, - custom_hyperparameters=None, - ) + automl.allowed_pipelines = pipelines + automl.score_pipelines( 
pipelines, X, y, objectives=["Log Loss Binary", "F1", "AUC"] ) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 24d9b0d13d..7f5cb3341f 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -556,13 +556,12 @@ def test_automl_feature_selection( problem_type="binary", max_iterations=2, start_iteration_callback=start_iteration_callback, - allowed_component_graphs= - { - "Name": [ - "RF Classifier Select From Model", - "Logistic Regression Classifier", - ] - }, + allowed_component_graphs={ + "Name": [ + "RF Classifier Select From Model", + "Logistic Regression Classifier", + ] + }, ) automl.search() @@ -643,7 +642,9 @@ def test_automl_allowed_component_graphs_algorithm( mock_algo_init.side_effect = Exception("mock algo init") X, y = X_y_binary - allowed_component_graphs = {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} + allowed_component_graphs = { + "Mock Binary Classification Pipeline": [dummy_classifier_estimator_class] + } with pytest.raises(Exception, match="mock algo init"): AutoMLSearch( X_train=X, @@ -963,12 +964,11 @@ def test_component_graph_with_incorrect_problem_type( X_train=X, y_train=y, problem_type="regression", - allowed_component_graphs= - { - "Mock Binary Classification Pipeline": [ - dummy_classifier_estimator_class - ] - }, + allowed_component_graphs={ + "Mock Binary Classification Pipeline": [ + dummy_classifier_estimator_class + ] + }, ) @@ -2992,12 +2992,12 @@ def test_iterative_algorithm_pipeline_custom_hyperparameters_make_pipeline( if component_graphs: component_graph_ = { - "Name_0": [ - "Drop Columns Transformer", - "Imputer", - "Random Forest Classifier", - ] - } + "Name_0": [ + "Drop Columns Transformer", + "Imputer", + "Random Forest Classifier", + ] + } if automl_parameters: automl_parameters_ = { @@ -3962,18 +3962,18 @@ def test_automl_adds_pipeline_parameters_to_custom_pipeline_hyperparams( X, y = X_y_binary component_graph_1 = { - "Imputer": ["Imputer"], - "Imputer_1": ["Imputer", "Imputer"], - "One Hot Encoder": ["One Hot Encoder", "Imputer_1"], - "Random Forest Classifier": ["Random Forest Classifier", "One Hot Encoder"], - } + "Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + "One Hot Encoder": ["One Hot Encoder", "Imputer_1"], + "Random Forest Classifier": ["Random Forest Classifier", "One Hot Encoder"], + } component_graph_2 = [ - "Imputer", - "Imputer", - "One Hot Encoder", - "Random Forest Classifier", - ] + "Imputer", + "Imputer", + "One Hot Encoder", + "Random Forest Classifier", + ] component_graph_3 = [ "Imputer", @@ -4434,7 +4434,9 @@ def test_score_batch_works( y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs={"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}, + allowed_component_graphs={ + "Mock Binary Classification Pipeline": [dummy_classifier_estimator_class] + }, optimize_thresholds=False, ) @@ -4500,7 +4502,9 @@ class Pipeline2(dummy_binary_pipeline_class): y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs={"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]}, + allowed_component_graphs={ + "Mock Binary Classification Pipeline": [dummy_classifier_estimator_class] + }, ) with pytest.raises( @@ -4526,7 +4530,9 @@ def test_score_batch_before_fitting_yields_error_nan_scores( y_train=y, problem_type="binary", max_iterations=1, - allowed_component_graphs= {"Mock Binary Classification Pipeline": 
[dummy_classifier_estimator_class]}, + allowed_component_graphs={ + "Mock Binary Classification Pipeline": [dummy_classifier_estimator_class] + }, ) scored_pipelines = automl.score_pipelines( @@ -4693,9 +4699,11 @@ def test_automl_validates_data_passed_in_to_allowed_component_graphs( y, problem_type="binary", allowed_component_graphs=[ - {"Mock Binary Classification Pipeline": [ - dummy_classifier_estimator_class - ]} + { + "Mock Binary Classification Pipeline": [ + dummy_classifier_estimator_class + ] + } ], ) @@ -4707,10 +4715,9 @@ def test_automl_validates_data_passed_in_to_allowed_component_graphs( X, y, problem_type="binary", - allowed_component_graphs= - { - "Mock Binary Classification Pipeline": dummy_classifier_estimator_class - }, + allowed_component_graphs={ + "Mock Binary Classification Pipeline": dummy_classifier_estimator_class + }, ) From 352f8d000ee6296c47130d42a86504238721adf1 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Thu, 17 Jun 2021 16:59:42 +0100 Subject: [PATCH 81/85] docs update --- docs/source/user_guide/automl.ipynb | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/source/user_guide/automl.ipynb b/docs/source/user_guide/automl.ipynb index 3dd1fd5bdc..fb2f7b34d0 100644 --- a/docs/source/user_guide/automl.ipynb +++ b/docs/source/user_guide/automl.ipynb @@ -220,7 +220,7 @@ "source": [ "### Using custom pipelines\n", "\n", - "EvalML's AutoML algorithm generates a set of pipelines to search with. To provide a custom set instead, set allowed_component_graphs to a list of custom component graphs. `AutoMLSearch` will use these to generate `Pipeline` instances. Note: this will prevent AutoML from generating other pipelines to search over." + "EvalML's AutoML algorithm generates a set of pipelines to search with. To provide a custom set instead, set allowed_component_graphs to a dictionary of custom component graphs. `AutoMLSearch` will use these to generate `Pipeline` instances. Note: this will prevent AutoML from generating other pipelines to search over." 
] }, { @@ -235,9 +235,8 @@ "automl_custom = evalml.automl.AutoMLSearch(X_train=X_train,\n", " y_train=y_train,\n", " problem_type='multiclass',\n", - " allowed_component_graphs=[{\"My_pipeline\": ['Simple Imputer', 'Random Forest Classifier'],\n", - " \"random_seed\": 42},\n", - " {\"My_other_pipeline\": ['One Hot Encoder', 'Random Forest Classifier']}])" + " allowed_component_graphs={\"My_pipeline\": ['Simple Imputer', 'Random Forest Classifier']},\n", + " {\"My_other_pipeline\": ['One Hot Encoder', 'Random Forest Classifier']})" ] }, { From da5bd00138ac2675ce8f99e55234a19ccb5dcca2 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 18 Jun 2021 11:55:17 +0100 Subject: [PATCH 82/85] lint fixes and updates --- .../test_automl_search_classification.py | 38 +++++++++---------- .../test_automl_search_regression.py | 14 +++---- .../tests/automl_tests/test_automl_utils.py | 18 ++++----- evalml/tests/automl_tests/test_engine_base.py | 12 +++--- 4 files changed, 38 insertions(+), 44 deletions(-) diff --git a/evalml/tests/automl_tests/test_automl_search_classification.py b/evalml/tests/automl_tests/test_automl_search_classification.py index 68583baedd..a11cb35646 100644 --- a/evalml/tests/automl_tests/test_automl_search_classification.py +++ b/evalml/tests/automl_tests/test_automl_search_classification.py @@ -719,9 +719,9 @@ def test_automl_component_graphs_specified_component_graphs_binary( X_train=X, y_train=y, problem_type="binary", - allowed_component_graphs=[ - {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} - ], + allowed_component_graphs={ + "Mock Binary Classification Pipeline": [dummy_classifier_estimator_class] + }, optimize_thresholds=False, allowed_model_families=None, ) @@ -758,13 +758,11 @@ def test_automl_component_graphs_specified_component_graphs_multi( X_train=X, y_train=y, problem_type="multiclass", - allowed_component_graphs=[ - { - "Mock Multiclass Classification Pipeline": [ - dummy_classifier_estimator_class - ] - } - ], + allowed_component_graphs={ + "Mock Multiclass Classification Pipeline": [ + dummy_classifier_estimator_class + ] + }, allowed_model_families=None, ) expected_pipeline = dummy_multiclass_pipeline_class({}) @@ -964,9 +962,9 @@ def test_automl_component_graphs_init_allowed_both_specified_binary( X_train=X, y_train=y, problem_type="binary", - allowed_component_graphs=[ - {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} - ], + allowed_component_graphs={ + "Mock Binary Classification Pipeline": [dummy_classifier_estimator_class] + }, allowed_model_families=[ModelFamily.RANDOM_FOREST], optimize_thresholds=False, ) @@ -1004,13 +1002,11 @@ def test_automl_component_graphs_init_allowed_both_specified_multi( X_train=X, y_train=y, problem_type="multiclass", - allowed_component_graphs=[ - { - "Mock Multiclass Classification Pipeline": [ - dummy_classifier_estimator_class - ] - } - ], + allowed_component_graphs={ + "Mock Multiclass Classification Pipeline": [ + dummy_classifier_estimator_class + ] + }, allowed_model_families=[ModelFamily.RANDOM_FOREST], ) mock_score.return_value = {automl.objective.name: 1.0} @@ -1076,7 +1072,7 @@ def test_automl_component_graphs_search( problem_type=problem_type, max_iterations=5, start_iteration_callback=start_iteration_callback, - allowed_component_graphs=[component_graph], + allowed_component_graphs=component_graph, optimize_thresholds=False, ) automl.search() diff --git a/evalml/tests/automl_tests/test_automl_search_regression.py 
b/evalml/tests/automl_tests/test_automl_search_regression.py index 3688466eb0..49d763002c 100644 --- a/evalml/tests/automl_tests/test_automl_search_regression.py +++ b/evalml/tests/automl_tests/test_automl_search_regression.py @@ -242,9 +242,9 @@ def test_automl_allowed_component_graphs_specified_component_graphs( X_train=X, y_train=y, problem_type="regression", - allowed_component_graphs=[ - {"Mock Regression Pipeline": [dummy_regressor_estimator_class]} - ], + allowed_component_graphs={ + "Mock Regression Pipeline": [dummy_regressor_estimator_class] + }, allowed_model_families=None, ) mock_score.return_value = {automl.objective.name: 1.0} @@ -357,9 +357,9 @@ def test_automl_allowed_component_graphs_init_allowed_both_specified( X_train=X, y_train=y, problem_type="regression", - allowed_component_graphs=[ - {"Mock Regression Pipeline": [dummy_regressor_estimator_class]} - ], + allowed_component_graphs={ + "Mock Regression Pipeline": [dummy_regressor_estimator_class] + }, allowed_model_families=[ModelFamily.RANDOM_FOREST], ) mock_score.return_value = {automl.objective.name: 1.0} @@ -399,7 +399,7 @@ def test_automl_allowed_component_graphs_search( problem_type="regression", max_iterations=2, start_iteration_callback=start_iteration_callback, - allowed_component_graphs=[component_graph], + allowed_component_graphs=component_graph, ) automl.search() diff --git a/evalml/tests/automl_tests/test_automl_utils.py b/evalml/tests/automl_tests/test_automl_utils.py index 5cffef7462..191e152d68 100644 --- a/evalml/tests/automl_tests/test_automl_utils.py +++ b/evalml/tests/automl_tests/test_automl_utils.py @@ -333,16 +333,14 @@ def test_get_hyperparameter_ranges(): ], ) def test_get_pipelines_from_component_graphs(problem_type, estimator): - component_graphs = [ - {"Name_0": ["Imputer", estimator], "random_seed": 42}, - { - "Name_1": { - "Imputer": ["Imputer"], - "Imputer_1": ["Imputer", "Imputer"], - estimator: [estimator, "Imputer_1"], - } + component_graphs = { + "Name_0": ["Imputer", estimator], + "Name_1": { + "Imputer": ["Imputer"], + "Imputer_1": ["Imputer", "Imputer"], + estimator: [estimator, "Imputer_1"], }, - ] + } if problem_type == "time series regression": with pytest.raises(ValueError, match="date_index, gap, and max_delay"): get_pipelines_from_component_graphs(component_graphs, problem_type) @@ -350,7 +348,7 @@ def test_get_pipelines_from_component_graphs(problem_type, estimator): returned_pipelines = get_pipelines_from_component_graphs( component_graphs, problem_type ) - assert returned_pipelines[0].random_seed == 42 + assert returned_pipelines[0].random_seed == 0 assert returned_pipelines[1].random_seed == 0 if problem_type == "binary": assert all( diff --git a/evalml/tests/automl_tests/test_engine_base.py b/evalml/tests/automl_tests/test_engine_base.py index f9c9773be4..36fe9741cd 100644 --- a/evalml/tests/automl_tests/test_engine_base.py +++ b/evalml/tests/automl_tests/test_engine_base.py @@ -29,9 +29,9 @@ def test_train_and_score_pipelines( problem_type="binary", max_time=1, max_batches=1, - allowed_component_graphs=[ - {"Mock Binary Classification Pipeline": [dummy_classifier_estimator_class]} - ], + allowed_component_graphs={ + "Mock Binary Classification Pipeline": [dummy_classifier_estimator_class] + }, optimize_thresholds=False, ) pipeline = dummy_binary_pipeline_class({}) @@ -74,9 +74,9 @@ def test_train_and_score_pipelines_error( problem_type="binary", max_time=1, max_batches=1, - allowed_component_graphs=[ - {"Mock Binary Classification Pipeline": 
[dummy_classifier_estimator_class]} - ], + allowed_component_graphs={ + "Mock Binary Classification Pipeline": [dummy_classifier_estimator_class] + }, optimize_thresholds=False, ) pipeline = dummy_binary_pipeline_class({}) From a9e22f63dfe0bdd51bb93d92cdff5d3cf2e23967 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 18 Jun 2021 12:15:29 +0100 Subject: [PATCH 83/85] test fix --- evalml/tests/automl_tests/test_automl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 7f5cb3341f..2db6a518d0 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -1078,10 +1078,10 @@ def test_default_objective(X_y_binary): def test_add_to_rankings( mock_fit, mock_score, - dummy_classifier_linear_component_graph, mock_predict_proba, mock_encode, mock_threshold, + dummy_classifier_linear_component_graph, dummy_binary_pipeline_class, X_y_binary, ): @@ -1139,10 +1139,10 @@ def test_add_to_rankings( def test_add_to_rankings_no_search( mock_fit, mock_score, - dummy_classifier_linear_component_graph, mock_predict_proba, mock_encode, mock_threshold, + dummy_classifier_linear_component_graph, dummy_binary_pipeline_class, X_y_binary, ): From 2422aaf69981ae920bf023888f85a6af0900cfca Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 18 Jun 2021 13:45:01 +0100 Subject: [PATCH 84/85] remove unused fixtures --- docs/source/user_guide/automl.ipynb | 4 +-- evalml/tests/conftest.py | 38 ----------------------------- 2 files changed, 2 insertions(+), 40 deletions(-) diff --git a/docs/source/user_guide/automl.ipynb b/docs/source/user_guide/automl.ipynb index fb2f7b34d0..e003a71391 100644 --- a/docs/source/user_guide/automl.ipynb +++ b/docs/source/user_guide/automl.ipynb @@ -235,8 +235,8 @@ "automl_custom = evalml.automl.AutoMLSearch(X_train=X_train,\n", " y_train=y_train,\n", " problem_type='multiclass',\n", - " allowed_component_graphs={\"My_pipeline\": ['Simple Imputer', 'Random Forest Classifier']},\n", - " {\"My_other_pipeline\": ['One Hot Encoder', 'Random Forest Classifier']})" + " allowed_component_graphs={\"My_pipeline\": ['Simple Imputer', 'Random Forest Classifier'],\n", + " \"My_other_pipeline\": ['One Hot Encoder', 'Random Forest Classifier']})" ] }, { diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 2a6eb20e24..4c094870bc 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -356,44 +356,6 @@ def fit(self, X, y): return MockEstimator -@pytest.fixture -def dummy_binary_estimator_class(): - class MockEstimator(Estimator): - name = "Mock Binary Classifier" - model_family = ModelFamily.NONE - supported_problem_types = [ - ProblemTypes.BINARY, - ProblemTypes.TIME_SERIES_BINARY, - ] - hyperparameter_ranges = {"a": Integer(0, 10), "b": Real(0, 10)} - - def __init__(self, a=1, b=0, random_seed=0): - super().__init__( - parameters={"a": a, "b": b}, component_obj=None, random_seed=random_seed - ) - - return MockEstimator - - -@pytest.fixture -def dummy_multiclass_estimator_class(): - class MockEstimator(Estimator): - name = "Mock Multiclass Classifier" - model_family = ModelFamily.NONE - supported_problem_types = [ - ProblemTypes.MULTICLASS, - ProblemTypes.TIME_SERIES_MULTICLASS, - ] - hyperparameter_ranges = {"a": Integer(0, 10), "b": Real(0, 10)} - - def __init__(self, a=1, b=0, random_seed=0): - super().__init__( - parameters={"a": a, "b": b}, component_obj=None, random_seed=random_seed - ) - - return 
MockEstimator - - @pytest.fixture def dummy_classifier_linear_component_graph(dummy_classifier_estimator_class): component_graph_linear = { From 86e788c8093e8f9b850dd4ce923a9526de8bd50f Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 18 Jun 2021 15:25:15 +0100 Subject: [PATCH 85/85] docstring fix --- evalml/automl/automl_search.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index f00ca14098..712fab1ebd 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -192,8 +192,7 @@ def __init__( Only applicable if patience is not None. Defaults to None. allowed_component_graphs (dict): A dictionary of lists or ComponentGraphs indicating the component graphs allowed in the search. - The format should follow { "Name_0": [list_of_components], - "Name_1": [ComponentGraph(...)] } + The format should follow { "Name_0": [list_of_components], "Name_1": [ComponentGraph(...)] } The default of None indicates all pipeline component graphs for this problem type are allowed. Setting this field will cause allowed_model_families to be ignored.
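
Taken together, the changes above settle on a single dict (rather than a list of dicts) for allowed_component_graphs, with the per-graph "random_seed" entries dropped. The sketch below is illustrative only, assuming an evalml install from this era of the API: the graph names and component choices are hypothetical, and the demo-data helpers are used just to make the snippet self-contained rather than being part of the patches.

    import evalml
    from evalml.automl import AutoMLSearch

    # toy data so the example runs end to end
    X, y = evalml.demos.load_breast_cancer()
    X_train, X_test, y_train, y_test = evalml.preprocessing.split_data(
        X, y, problem_type="binary"
    )

    # one dict, keyed by graph name; each value is either a flat list of
    # components or a dict-style graph mapping node name -> [component, *parents]
    # (illustrative graphs -- the names are not taken from the patches above)
    allowed_component_graphs = {
        "Linear Graph": ["Imputer", "Random Forest Classifier"],
        "Branching Graph": {
            "Imputer": ["Imputer"],
            "One Hot Encoder": ["One Hot Encoder", "Imputer"],
            "Random Forest Classifier": ["Random Forest Classifier", "One Hot Encoder"],
        },
    }

    automl = AutoMLSearch(
        X_train=X_train,
        y_train=y_train,
        problem_type="binary",
        allowed_component_graphs=allowed_component_graphs,
    )
    automl.search()

As the updated docstring notes, a value may also be a ComponentGraph instance instead of a list or dict-style graph.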