Commit

Fixing broken automl tests and updating release notes for PR 1050.
freddyaboulton committed Aug 13, 2020
1 parent f9c3faa commit 2d15079
Showing 3 changed files with 12 additions and 8 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -7,6 +7,7 @@ Release Notes
* Added `explain_predictions` and `explain_predictions_best_worst` for explaining multiple predictions with SHAP :pr:`1016`
* Added new LSA component for text featurization :pr:`1022`
* Added guide on installing with conda :pr:`1041`
+ * Added `percent_better_than_baseline` to Automl search rankings and full rankings table :pr:`1050`
* Fixes
* Updated TextFeaturizer component to no longer require an internet connection to run :pr:`1022`
* Fixed non-deterministic element of TextFeaturizer transformations :pr:`1022`
5 changes: 3 additions & 2 deletions evalml/automl/automl_search.py
@@ -197,6 +197,7 @@ def __init__(self,
self.allowed_model_families = allowed_model_families
self._automl_algorithm = None
self._start = None
+ self._baseline_score = None

self._validate_problem_type()

@@ -528,7 +529,7 @@ def _add_baseline_pipelines(self, X, y):
self._start)

baseline_results = self._compute_cv_scores(baseline, X, y)
- self.baseline_score = baseline_results["cv_score_mean"]
+ self._baseline_score = baseline_results["cv_score_mean"]
self._add_result(trained_pipeline=baseline,
parameters=baseline.parameters,
training_time=baseline_results['training_time'],
@@ -615,7 +616,7 @@ def _compute_cv_scores(self, pipeline, X, y):

def _add_result(self, trained_pipeline, parameters, training_time, cv_data, cv_scores):
cv_score = cv_scores.mean()
- percent_better = np.round(self.objective.calculate_percent_difference(cv_score, self.baseline_score), 2)
+ percent_better = np.round(self.objective.calculate_percent_difference(cv_score, self._baseline_score), 2)
# calculate high_variance_cv
# if the coefficient of variance is greater than .2
with warnings.catch_warnings():
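
The rename above (self.baseline_score to self._baseline_score) keeps the baseline CV score as a private attribute, which _add_result then compares against via self.objective.calculate_percent_difference and rounds with np.round. As a rough illustration of that comparison, the sketch below is a hypothetical stand-in, not evalml's implementation: the helper name percent_difference, the greater_is_better flag, and the NaN handling for a missing baseline are all assumptions.

import numpy as np

def percent_difference(score, baseline_score, greater_is_better=True):
    # Hypothetical sketch of a percent-better-than-baseline calculation.
    # Assumption: a missing or zero baseline yields NaN instead of raising.
    if baseline_score is None or np.isnan(baseline_score) or baseline_score == 0:
        return np.nan
    difference = (score - baseline_score) / abs(baseline_score) * 100
    # Assumption: for lower-is-better objectives (e.g. log loss), flip the sign
    # so that beating the baseline still reads as a positive percentage.
    return difference if greater_is_better else -difference

# Example: a log loss of 0.33 against a baseline of 0.66 reads as 50% better.
print(np.round(percent_difference(0.33, 0.66, greater_is_better=False), 2))  # 50.0
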
14 changes: 8 additions & 6 deletions evalml/tests/automl_tests/test_automl.py
@@ -55,7 +55,8 @@ def test_search_results(X_y_regression, X_y_binary, X_y_multi, automl_type):
assert automl.results['search_order'] == [0, 1]
assert len(automl.results['pipeline_results']) == 2
for pipeline_id, results in automl.results['pipeline_results'].items():
- assert results.keys() == {'id', 'pipeline_name', 'pipeline_class', 'pipeline_summary', 'parameters', 'score', 'high_variance_cv', 'training_time', 'cv_data'}
+ assert results.keys() == {'id', 'pipeline_name', 'pipeline_class', 'pipeline_summary', 'parameters', 'score', 'high_variance_cv', 'training_time',
+ 'cv_data', 'percent_better_than_baseline'}
assert results['id'] == pipeline_id
assert isinstance(results['pipeline_name'], str)
assert issubclass(results['pipeline_class'], expected_pipeline_class)
@@ -77,11 +78,11 @@ def test_search_results(X_y_regression, X_y_binary, X_y_multi, automl_type):
assert isinstance(automl.rankings, pd.DataFrame)
assert isinstance(automl.full_rankings, pd.DataFrame)
assert np.all(automl.rankings.dtypes == pd.Series(
- [np.dtype('int64'), np.dtype('O'), np.dtype('float64'), np.dtype('bool'), np.dtype('O')],
- index=['id', 'pipeline_name', 'score', 'high_variance_cv', 'parameters']))
+ [np.dtype('int64'), np.dtype('O'), np.dtype('float64'), np.dtype('float64'), np.dtype('bool'), np.dtype('O')],
+ index=['id', 'pipeline_name', 'score', 'percent_better_than_baseline', 'high_variance_cv', 'parameters']))
assert np.all(automl.full_rankings.dtypes == pd.Series(
- [np.dtype('int64'), np.dtype('O'), np.dtype('float64'), np.dtype('bool'), np.dtype('O')],
- index=['id', 'pipeline_name', 'score', 'high_variance_cv', 'parameters']))
+ [np.dtype('int64'), np.dtype('O'), np.dtype('float64'), np.dtype('float64'), np.dtype('bool'), np.dtype('O')],
+ index=['id', 'pipeline_name', 'score', 'percent_better_than_baseline', 'high_variance_cv', 'parameters']))


@pytest.mark.parametrize("automl_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION])
@@ -729,7 +730,8 @@ def test_no_search():
assert isinstance(automl.rankings, pd.DataFrame)
assert isinstance(automl.full_rankings, pd.DataFrame)

- df_columns = ["id", "pipeline_name", "score", "high_variance_cv", "parameters"]
+ df_columns = ["id", "pipeline_name", "score", "percent_better_than_baseline",
+ "high_variance_cv", "parameters"]
assert (automl.rankings.columns == df_columns).all()
assert (automl.full_rankings.columns == df_columns).all()

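
The updated tests pin both the position and the float64 dtype of the new percent_better_than_baseline column in automl.rankings and automl.full_rankings. A brief usage sketch follows; the demo dataset and constructor arguments are illustrative assumptions rather than values taken from this commit.

import evalml
from evalml.automl import AutoMLSearch

# Illustrative setup only: the demo dataset and search settings are assumptions.
X, y = evalml.demos.load_breast_cancer()
automl = AutoMLSearch(problem_type="binary", max_pipelines=3)
automl.search(X, y)

# The new column sits between `score` and `high_variance_cv` in both tables.
print(automl.rankings[["id", "pipeline_name", "score", "percent_better_than_baseline"]])
print(automl.full_rankings["percent_better_than_baseline"].dtype)  # expected: float64
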
