Update our Logger to use Python's logging package, revised #763

Merged
merged 18 commits on May 21, 2020
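
For context, this PR replaces the old `Logger` class with helpers from `evalml/utils/logger.py` (`get_logger`, `log_title`, `log_subtitle`), whose implementation is not part of this diff. Below is a minimal sketch of what such helpers could look like on top of Python's standard `logging` package, matching only the call signatures used in the hunks (`get_logger(__file__)`, `log_title(logger, title)`, `log_subtitle(logger, title, underline="-")`); the handler setup, message format, and default underline character are assumptions.

```python
# Hypothetical sketch of the logger helpers used in this PR; not the actual
# evalml/utils/logger.py implementation, which this diff does not show.
import logging
import sys


def get_logger(name):
    """Return a logger for `name`, attaching a stdout handler the first time it is requested."""
    logger = logging.getLogger(name)
    if not logger.handlers:
        logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger


def log_title(logger, title):
    """Log `title` inside a banner of '*' characters."""
    logger.info("*" * (len(title) + 4))
    logger.info("* %s *" % title)
    logger.info("*" * (len(title) + 4))
    logger.info("")


def log_subtitle(logger, title, underline="="):
    """Log `title` followed by a rule of `underline` characters."""
    logger.info("")
    logger.info(title)
    logger.info(underline * len(title))
```

Call sites then create a module-level logger with `logger = get_logger(__file__)` and use the standard `logger.info` / `logger.warning` / `logger.error` methods, as the changes below show.
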
1 change: 1 addition & 0 deletions docs/source/changelog.rst
@@ -14,6 +14,7 @@ Changelog
* Added __str__ for AutoSearch object :pr:`675`
* Add util methods to graph ROC and confusion matrix :pr:`720`
* Refactor `AutoBase` to `AutoSearchBase` :pr:`758`
* Updated our logger to use Python's logging utils :pr:`763`
* Refactor most of `AutoSearchBase._do_iteration` impl into `AutoSearchBase._evaluate` :pr:`762`
* Expanded `import_or_raise` to catch all exceptions :pr:`759`
* Don't allow `Recall` to be used as an objective for AutoML :pr:`784`
51 changes: 23 additions & 28 deletions evalml/automl/auto_search_base.py
@@ -17,9 +17,10 @@
from evalml.pipelines.components import handle_component
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.tuners import SKOptTuner
from evalml.utils import Logger, convert_to_seconds, get_random_state
from evalml.utils import convert_to_seconds, get_random_state
from evalml.utils.logger import get_logger, log_subtitle, log_title

logger = Logger()
logger = get_logger(__file__)


class AutoSearchBase:
@@ -46,8 +47,6 @@ def __init__(self, problem_type, tuner, cv, objective, max_pipelines, max_time,
if self.problem_type != self.objective.problem_type:
raise ValueError("Given objective {} is not compatible with a {} problem.".format(self.objective.name, self.problem_type.value))

logger.verbose = verbose

if additional_objectives is not None:
additional_objectives = [get_objective(o) for o in additional_objectives]
else:
@@ -94,7 +93,7 @@ def __init__(self, problem_type, tuner, cv, objective, max_pipelines, max_time,
self.plot = PipelineSearchPlots(self)

except ImportError:
logger.log("Warning: unable to import plotly; skipping pipeline search plotting\n")
logger.warning("Unable to import plotly; skipping pipeline search plotting\n")
self.plot = None

def __str__(self):
@@ -174,30 +173,30 @@ def search(self, X, y, feature_types=None, raise_errors=True, show_iteration_plo
if self.problem_type != ProblemTypes.REGRESSION:
self._check_multiclass(y)

logger.log_title("Beginning pipeline search")
logger.log("Optimizing for %s. " % self.objective.name, new_line=False)
log_title(logger, "Beginning pipeline search")
logger.info("Optimizing for %s. " % self.objective.name)

if self.objective.greater_is_better:
logger.log("Greater score is better.\n")
logger.info("Greater score is better.\n")
else:
logger.log("Lower score is better.\n")
logger.info("Lower score is better.\n")

# Set default max_pipeline if none specified
if self.max_pipelines is None and self.max_time is None:
self.max_pipelines = 5
logger.log("No search limit is set. Set using max_time or max_pipelines.\n")
logger.info("No search limit is set. Set using max_time or max_pipelines.\n")

if self.max_pipelines:
logger.log("Searching up to %s pipelines. " % self.max_pipelines)
logger.info("Searching up to %s pipelines. " % self.max_pipelines)
if self.max_time:
logger.log("Will stop searching for new pipelines after %d seconds.\n" % self.max_time)
logger.log("Possible model families: %s\n" % ", ".join([model.value for model in self.possible_model_families]))
logger.info("Will stop searching for new pipelines after %d seconds.\n" % self.max_time)
logger.info("Possible model families: %s\n" % ", ".join([model.value for model in self.possible_model_families]))

if self.detect_label_leakage:
leaked = guardrails.detect_label_leakage(X, y)
if len(leaked) > 0:
leaked = [str(k) for k in leaked.keys()]
logger.log("WARNING: Possible label leakage: %s" % ", ".join(leaked))
logger.warning("Possible label leakage: %s" % ", ".join(leaked))

search_iteration_plot = None
if self.plot:
@@ -235,11 +234,7 @@ def _check_stopping_condition(self, start):
elapsed = time.time() - start
if self.max_time and elapsed >= self.max_time:
return False
elif self.max_pipelines:
if num_pipelines >= self.max_pipelines:
return False
elif self.max_time and elapsed >= self.max_time:
logger.log("\n\nMax time elapsed. Stopping search early.")
Comment on lines -238 to -242
Contributor Author

Removed the elif and cleaned this up because I don't think this was reachable code (it would have hit the first if statement).

Contributor

Nice! I saw this in codecov a couple weeks ago and tried to clean it up and it kept failing unit tests. Cool that you got it passing
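
To illustrate the reasoning in the comment above, here is a simplified, self-contained sketch (not the exact evalml code) of the old and new stopping checks; the trailing `return True` stands in for the rest of `_check_stopping_condition`.

```python
# Simplified comparison of the old and new stopping checks from this hunk.

def check_old(max_time, max_pipelines, elapsed, num_pipelines):
    if max_time and elapsed >= max_time:
        return False
    elif max_pipelines:
        if num_pipelines >= max_pipelines:
            return False
    elif max_time and elapsed >= max_time:  # unreachable: duplicates the first condition
        print("Max time elapsed. Stopping search early.")
    return True


def check_new(max_time, max_pipelines, elapsed, num_pipelines):
    if max_time and elapsed >= max_time:
        return False
    elif max_pipelines and num_pipelines >= max_pipelines:
        return False
    return True


# The two versions agree for every combination of limits, confirming the removed
# branch (and its log message) could never execute.
assert all(
    check_old(mt, mp, e, n) == check_new(mt, mp, e, n)
    for mt in (None, 10)
    for mp in (None, 5)
    for e in (0, 20)
    for n in (0, 7)
)
```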

elif self.max_pipelines and num_pipelines >= self.max_pipelines:
return False

# check for early stopping
@@ -259,7 +254,7 @@ def _check_stopping_condition(self, start):
else:
num_without_improvement += 1
if num_without_improvement >= self.patience:
logger.log("\n\n{} iterations without improvement. Stopping search early...".format(self.patience))
logger.info("\n\n{} iterations without improvement. Stopping search early...".format(self.patience))
return False
return should_continue

@@ -440,18 +435,18 @@ def describe_pipeline(self, pipeline_id, return_dict=False):
pipeline_results = self.results['pipeline_results'][pipeline_id]

pipeline.describe()
logger.log_subtitle("Training")
logger.log("Training for {} problems.".format(pipeline.problem_type))
log_subtitle(logger, "Training")
logger.info("Training for {} problems.".format(pipeline.problem_type))

if self.optimize_thresholds and self.objective.problem_type == ProblemTypes.BINARY and self.objective.can_optimize_threshold:
logger.log("Objective to optimize binary classification pipeline thresholds for: {}".format(self.objective))
logger.info("Objective to optimize binary classification pipeline thresholds for: {}".format(self.objective))

logger.log("Total training time (including CV): %.1f seconds" % pipeline_results["training_time"])
logger.log_subtitle("Cross Validation", underline="-")
logger.info("Total training time (including CV): %.1f seconds" % pipeline_results["training_time"])
log_subtitle(logger, "Cross Validation", underline="-")

if pipeline_results["high_variance_cv"]:
logger.log("Warning! High variance within cross validation scores. " +
"Model may not perform as estimated on unseen data.")
logger.warning("High variance within cross validation scores. " +
"Model may not perform as estimated on unseen data.")

all_objective_scores = [fold["all_objective_scores"] for fold in pipeline_results["cv_data"]]
all_objective_scores = pd.DataFrame(all_objective_scores)
@@ -470,7 +465,7 @@ def describe_pipeline(self, pipeline_id, return_dict=False):
all_objective_scores = all_objective_scores.fillna("-")

with pd.option_context('display.float_format', '{:.3f}'.format, 'expand_frame_repr', False):
logger.log(all_objective_scores)
logger.info(all_objective_scores)

if return_dict:
return pipeline_results
8 changes: 4 additions & 4 deletions evalml/pipelines/components/component_base.py
@@ -1,9 +1,9 @@
from abc import ABC, abstractmethod

from evalml.exceptions import MethodPropertyNotFoundError
from evalml.utils import Logger, get_random_state
from evalml.utils import get_logger, get_random_state, log_subtitle

logger = Logger()
logger = get_logger(__file__)


class ComponentBase(ABC):
@@ -54,10 +54,10 @@ def describe(self, print_name=False, return_dict=False):
"""
if print_name:
title = self.name
logger.log_subtitle(title)
log_subtitle(logger, title)
for parameter in self.parameters:
parameter_str = ("\t * {} : {}").format(parameter, self.parameters[parameter])
logger.log(parameter_str)
logger.info(parameter_str)
if return_dict:
component_dict = {"name": self.name}
component_dict.update({"parameters": self.parameters})
22 changes: 12 additions & 10 deletions evalml/pipelines/pipeline_base.py
@@ -11,13 +11,15 @@

from evalml.exceptions import IllFormattedClassNameError
from evalml.utils import (
Logger,
classproperty,
get_logger,
get_random_state,
import_or_raise
import_or_raise,
log_subtitle,
log_title
)

logger = Logger()
logger = get_logger(__file__)


class PipelineBase(ABC):
@@ -143,18 +145,18 @@ def describe(self):
Returns:
dict: dictionary of all component parameters if return_dict is True, else None
"""
logger.log_title(self.name)
logger.log("Problem Type: {}".format(self.problem_type))
logger.log("Model Family: {}".format(str(self.model_family)))
log_title(logger, self.name)
logger.info("Problem Type: {}".format(self.problem_type))
logger.info("Model Family: {}".format(str(self.model_family)))

if self.estimator.name in self.input_feature_names:
logger.log("Number of features: {}".format(len(self.input_feature_names[self.estimator.name])))
logger.info("Number of features: {}".format(len(self.input_feature_names[self.estimator.name])))

# Summary of steps
logger.log_subtitle("Pipeline Steps")
log_subtitle(logger, "Pipeline Steps")
for number, component in enumerate(self.component_graph, 1):
component_string = str(number) + ". " + component.name
logger.log(component_string)
logger.info(component_string)
component.describe(print_name=False)

def _transform(self, X):
Expand Down Expand Up @@ -232,7 +234,7 @@ def _score(X, y, predictions, objective):
try:
score = objective.score(y, predictions, X)
except Exception as e:
logger.log('Error in PipelineBase.score while scoring objective {}: {}'.format(objective.name, str(e)))
logger.error('Error in PipelineBase.score while scoring objective {}: {}'.format(objective.name, str(e)))
return score

@classproperty
14 changes: 6 additions & 8 deletions evalml/tests/automl_tests/test_auto_classification_search.py
@@ -251,7 +251,7 @@ def test_additional_objectives(X_y):
@patch('evalml.pipelines.BinaryClassificationPipeline.predict_proba')
@patch('evalml.pipelines.BinaryClassificationPipeline.score')
@patch('evalml.pipelines.PipelineBase.fit')
def test_optimizable_threshold_enabled(mock_fit, mock_score, mock_predict_proba, mock_optimize_threshold, X_y, capsys):
def test_optimizable_threshold_enabled(mock_fit, mock_score, mock_predict_proba, mock_optimize_threshold, X_y, caplog):
mock_optimize_threshold.return_value = 0.8
X, y = X_y
automl = AutoClassificationSearch(objective='precision', max_pipelines=1, optimize_thresholds=True)
@@ -264,9 +264,8 @@ def test_optimizable_threshold_enabled(mock_fit, mock_score, mock_predict_proba,
assert automl.best_pipeline.threshold == 0.8

automl.describe_pipeline(0)
out, err = capsys.readouterr()
out = caplog.text
assert "Objective to optimize binary classification pipeline thresholds for" in out
assert err == ""


@patch('evalml.objectives.BinaryClassificationObjective.optimize_threshold')
@@ -298,20 +297,19 @@ def test_non_optimizable_threshold(mock_fit, mock_score, X_y):
assert automl.best_pipeline.threshold == 0.5


def test_describe_pipeline_objective_ordered(X_y, capsys):
def test_describe_pipeline_objective_ordered(X_y, caplog):
X, y = X_y
automl = AutoClassificationSearch(objective='AUC', max_pipelines=2)
automl.search(X, y)

automl.describe_pipeline(0)
out, err = capsys.readouterr()
out = caplog.text
out_stripped = " ".join(out.split())

objectives = [get_objective(obj) for obj in automl.additional_objectives]
objectives_names = [obj.name for obj in objectives]
expected_objective_order = " ".join(objectives_names)

assert err == ''
assert expected_objective_order in out_stripped


@@ -340,7 +338,7 @@ def test_max_time_units():
AutoClassificationSearch(objective='F1', max_time=(30, 'minutes'))


def test_early_stopping(capsys):
def test_early_stopping(caplog):
with pytest.raises(ValueError, match='patience value must be a positive integer.'):
automl = AutoClassificationSearch(objective='AUC', max_pipelines=5, allowed_model_families=['linear_model'], patience=-1, random_state=0)

@@ -360,7 +358,7 @@ def test_early_stopping(capsys):

automl.results = mock_results
automl._check_stopping_condition(time.time())
out, _ = capsys.readouterr()
out = caplog.text
assert "2 iterations without improvement. Stopping search early." in out


4 changes: 2 additions & 2 deletions evalml/tests/automl_tests/test_auto_regression_search.py
@@ -89,7 +89,7 @@ def add_result_callback(results, trained_pipeline, counts=counts):
assert counts["add_result_callback"] == max_pipelines


def test_early_stopping(capsys):
def test_early_stopping(caplog):
tolerance = 0.005
patience = 2
automl = AutoRegressionSearch(objective='mse', max_time='60 seconds', patience=patience, tolerance=tolerance, allowed_model_families=['linear_model'], random_state=0)
@@ -106,7 +106,7 @@ def test_early_stopping(capsys):

automl.results = mock_results
automl._check_stopping_condition(time.time())
out, _ = capsys.readouterr()
out = caplog.text
assert "2 iterations without improvement. Stopping search early." in out


21 changes: 11 additions & 10 deletions evalml/tests/automl_tests/test_autobase.py
@@ -9,28 +9,31 @@
from evalml.tuners import RandomSearchTuner


def test_pipeline_limits(capsys, X_y):
def test_pipeline_limits(caplog, X_y):
X, y = X_y

automl = AutoClassificationSearch(multiclass=False, max_pipelines=1)
automl.search(X, y)
out, err = capsys.readouterr()
out = caplog.text
assert "Searching up to 1 pipelines. " in out

caplog.clear()
automl = AutoClassificationSearch(multiclass=False, max_time=1)
automl.search(X, y)
out, err = capsys.readouterr()
out = caplog.text
assert "Will stop searching for new pipelines after 1 seconds" in out

caplog.clear()
automl = AutoClassificationSearch(multiclass=False, max_time=1, max_pipelines=5)
automl.search(X, y)
out, err = capsys.readouterr()
out = caplog.text
assert "Searching up to 5 pipelines. " in out
assert "Will stop searching for new pipelines after 1 seconds" in out

caplog.clear()
automl = AutoClassificationSearch(multiclass=False)
automl.search(X, y)
out, err = capsys.readouterr()
out = caplog.text
assert "No search limit is set. Set using max_time or max_pipelines." in out


@@ -124,14 +127,14 @@ def test_rankings(X_y, X_y_reg):


@patch('evalml.pipelines.PipelineBase.fit')
@patch('evalml.utils.logger.Logger.log')
@patch('evalml.guardrails.detect_label_leakage')
def test_detect_label_leakage(mock_detect_label_leakage, mock_log, mock_fit, capsys, X_y):
def test_detect_label_leakage(mock_detect_label_leakage, mock_fit, capsys, caplog, X_y):
X, y = X_y
mock_detect_label_leakage.return_value = {'var 1': 0.1234, 'var 2': 0.5678}
automl = AutoClassificationSearch(max_pipelines=1, random_state=0)
automl.search(X, y, raise_errors=False)
mock_log.assert_called_with("WARNING: Possible label leakage: var 1, var 2")
out = caplog.text
assert "Possible label leakage: var 1, var 2" in out


@patch('evalml.pipelines.BinaryClassificationPipeline.fit')
@@ -154,7 +157,6 @@ def _dummy_callback(param1, param2):
'add_result_callback': None,
'additional_objectives': ['Precision', 'AUC'],
'n_jobs': 2,
'verbose': True,
'optimize_thresholds': True
}

@@ -173,7 +175,6 @@ def _dummy_callback(param1, param2):
'Additional Objectives': search_params['additional_objectives'],
'Random State': 'RandomState(MT19937)',
'n_jobs': search_params['n_jobs'],
'Verbose': search_params['verbose'],
'Optimize Thresholds': search_params['optimize_thresholds']
}

25 changes: 25 additions & 0 deletions evalml/tests/component_tests/test_components.py
@@ -244,3 +244,28 @@ def test_regressor_call_predict_proba(test_classes):
component = MockEstimator({}, None, 0)
with pytest.raises(MethodPropertyNotFoundError):
component.predict_proba(X)

def test_component_describe(X_y, caplog):
X, y = X_y

class MockEstimator():
def fit(self, X, y):
pass

class MockComponent(Estimator):
name = 'Mock Estimator'
model_family = ModelFamily.LINEAR_MODEL
supported_problem_types = ['binary']
hyperparameter_ranges = {}

def __init__(self):
parameters = {}
est = MockEstimator()
super().__init__(parameters=parameters,
component_obj=est,
random_state=0)

est = MockComponent()
est.describe(print_name=True)
out = caplog.text
assert "Mock Estimator" in out