From afd369da7e5e26f33cb1dfda37fcdeafe060acfe Mon Sep 17 00:00:00 2001 From: Jeremy Date: Tue, 21 Feb 2023 15:04:24 -0500 Subject: [PATCH 1/5] first pass with test --- evalml/pipelines/utils.py | 69 +++++++++++++++++++ .../pipeline_tests/test_pipeline_utils.py | 24 +++++++ 2 files changed, 93 insertions(+) diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index e4d212e218..bfdbffb0dd 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -638,6 +638,75 @@ def generate_pipeline_code(element): return "\n".join(code_strings) +def generate_pipeline_example( + pipeline, + path_to_train, + path_to_holdout, + target, + path_to_mapping="", + output_file_path=None, +): + """Creates and returns a string that contains the Python imports and code required for running the EvalML pipeline. + + Args: + pipeline (pipeline instance): The instance of the pipeline to generate string Python code. + path_to_train (str): path to training data. + path_to_holdout (str): path to holdout data. + target (str): target variable. + path_to_mapping (str): path to mapping json + output_file_path (str): path to output python file. + + Returns: + str: String representation of Python code that can be run separately in order to recreate the pipeline instance. + Does not include code for custom component implementation. + + """ + output_str = f""" +import evalml +import woodwork +import pandas as pd + +PATH_TO_TRAIN = "{path_to_train}" +PATH_TO_HOLDOUT = "{path_to_holdout}" +TARGET = "{target}" +column_mapping = "{path_to_mapping}" + +# This is the machine learning pipeline you have exported. +# By running this code you will fit the pipeline on the files provided +# and you can then use this pipeline for prediction and model understanding. +{generate_pipeline_code(pipeline)} + +print(pipeline.name) +print(pipeline.parameters) +pipeline.describe() + +df = pd.read_csv(PATH_TO_TRAIN) +y_train = df[TARGET] +X_train = df.drop(TARGET, axis=1) + +pipeline.fit(X_train, y_train) + +# You can now generate predictions as well as run model understanding. +df = pd.read_csv(PATH_TO_HOLDOUT) +y_holdout = df[TARGET] +X_holdout= df.drop(TARGET, axis=1) + +pipeline.predict(X_holdout) + +# Note: to predict on new data you have on hand +# Map the column names to AML internal names and run prediction +# X_test = X_test.rename(column_mapping, axis=1) +# pipeline.predict(X_test) + +# For more info please check out: +# https://evalml.alteryx.com/en/stable/user_guide/automl.html + """ + if output_file_path: + with open(output_file_path, "w") as text_file: + text_file.write(output_str) + return output_str + + def _make_stacked_ensemble_pipeline( input_pipelines, problem_type, diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index 9bc8c82755..fdd06f83e6 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -1,3 +1,4 @@ +import os from unittest.mock import patch import numpy as np @@ -45,6 +46,7 @@ _get_preprocessing_components, _make_pipeline_from_multiple_graphs, generate_pipeline_code, + generate_pipeline_example, get_estimators, is_classification, is_regression, @@ -827,6 +829,28 @@ def __init__(self, random_arg=False, random_seed=0): assert pipeline == expected_code +def test_generate_pipeline_example(tmpdir, breast_cancer_local): + path = os.path.join(str(tmpdir), "train.csv") + X, y = breast_cancer_local + + from evalml import AutoMLSearch + + aml = AutoMLSearch(X_train=X, y_train=y, problem_type="binary") + aml.search() + binary_pipeline = aml.best_pipeline + # binary_pipeline = BinaryClassificationPipeline( + # ["Imputer", "Random Forest Classifier"], + # ) + + X["target"] = y + X.to_csv(path) + + pipeline_example = generate_pipeline_example(binary_pipeline, path, path, "target") + print(pipeline_example) + exec(pipeline_example) + assert pipeline_example + + def test_rows_of_interest_errors(X_y_binary): pipeline = BinaryClassificationPipeline( component_graph=["Logistic Regression Classifier"], From b3c3315cfe164ae2d7c7064f3af30d821f800add Mon Sep 17 00:00:00 2001 From: Jeremy Date: Tue, 21 Feb 2023 15:36:28 -0500 Subject: [PATCH 2/5] Cover all problem_types --- evalml/pipelines/utils.py | 19 +++++- .../pipeline_tests/test_pipeline_utils.py | 63 ++++++++++++++++--- 2 files changed, 71 insertions(+), 11 deletions(-) diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index bfdbffb0dd..9a59424277 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -690,7 +690,9 @@ def generate_pipeline_example( df = pd.read_csv(PATH_TO_HOLDOUT) y_holdout = df[TARGET] X_holdout= df.drop(TARGET, axis=1) - +""" + if not is_time_series(pipeline.problem_type): + output_str += """ pipeline.predict(X_holdout) # Note: to predict on new data you have on hand @@ -700,7 +702,20 @@ def generate_pipeline_example( # For more info please check out: # https://evalml.alteryx.com/en/stable/user_guide/automl.html - """ +""" + else: + output_str += """ +pipeline.predict(X_holdout, X_train=X_train, y_train=y_train) + +# Note: to predict on new data you have on hand +# Map the column names to AML internal names and run prediction +# X_test = X_test.rename(column_mapping, axis=1) +# pipeline.predict(X_test, X_train=X_train, y_train=y_train) + +# For more info please check out: +# https://evalml.alteryx.com/en/stable/user_guide/automl.html +""" + if output_file_path: with open(output_file_path, "w") as text_file: text_file.write(output_str) diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index fdd06f83e6..0d56be6bbe 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -829,23 +829,68 @@ def __init__(self, random_arg=False, random_seed=0): assert pipeline == expected_code -def test_generate_pipeline_example(tmpdir, breast_cancer_local): +@pytest.mark.parametrize( + "automl_type", + [ + ProblemTypes.BINARY, + ProblemTypes.MULTICLASS, + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.TIME_SERIES_MULTICLASS, + ProblemTypes.TIME_SERIES_BINARY, + ], +) +def test_generate_pipeline_example( + automl_type, + tmpdir, + AutoMLTestEnv, + X_y_binary, + X_y_multi, + X_y_regression, + ts_data, +): path = os.path.join(str(tmpdir), "train.csv") - X, y = breast_cancer_local + if automl_type == ProblemTypes.BINARY: + X, y = X_y_binary + elif automl_type == ProblemTypes.MULTICLASS: + X, y = X_y_multi + elif automl_type == ProblemTypes.REGRESSION: + X, y = X_y_regression + elif ( + automl_type == ProblemTypes.TIME_SERIES_MULTICLASS + or automl_type == ProblemTypes.TIME_SERIES_BINARY + ): + X, _, y = ts_data(problem_type=automl_type) + else: + X, _, y = ts_data(problem_type=automl_type) from evalml import AutoMLSearch - aml = AutoMLSearch(X_train=X, y_train=y, problem_type="binary") - aml.search() - binary_pipeline = aml.best_pipeline - # binary_pipeline = BinaryClassificationPipeline( - # ["Imputer", "Random Forest Classifier"], - # ) + aml = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=automl_type, + optimize_thresholds=False, + max_time=1, + max_iterations=5, + problem_configuration={ + "time_index": "date", + "gap": 1, + "max_delay": 1, + "forecast_horizon": 3, + } + if is_time_series(automl_type) + else None, + ) + env = AutoMLTestEnv(automl_type) + with env.test_context(score_return_value={aml.objective.name: 1.0}): + aml.search() + pipeline = aml.best_pipeline X["target"] = y X.to_csv(path) - pipeline_example = generate_pipeline_example(binary_pipeline, path, path, "target") + pipeline_example = generate_pipeline_example(pipeline, path, path, "target") print(pipeline_example) exec(pipeline_example) assert pipeline_example From 574156a38a528d37c0cb46c7421f58f3a2deacbe Mon Sep 17 00:00:00 2001 From: Jeremy Date: Tue, 21 Feb 2023 15:37:42 -0500 Subject: [PATCH 3/5] RL --- docs/source/release_notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 47478f4b08..9b155759dd 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -2,6 +2,7 @@ Release Notes ------------- **Future Releases** * Enhancements + * Implement ``generate_pipeline_example`` :pr:`4023` * Fixes * Changes * Increase min catboost to 1.1.1 and xgboost to 1.7.0 to add nullable type support for those estimators :pr:`3996` From a0b3d75ca56bb2bef8ade0c991a0c8fd5aa60146 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Wed, 22 Feb 2023 10:38:57 -0500 Subject: [PATCH 4/5] Assert file exists --- evalml/tests/pipeline_tests/test_pipeline_utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index 0d56be6bbe..4a2446f5fd 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -889,11 +889,16 @@ def test_generate_pipeline_example( X["target"] = y X.to_csv(path) - - pipeline_example = generate_pipeline_example(pipeline, path, path, "target") - print(pipeline_example) + output_path = os.path.join(str(tmpdir), "example.py") + pipeline_example = generate_pipeline_example( + pipeline=pipeline, + path_to_train=path, + path_to_holdout=path, + target="target", + output_file_path=output_path, + ) exec(pipeline_example) - assert pipeline_example + assert os.path.exists(output_path) def test_rows_of_interest_errors(X_y_binary): From 6523ce7935512516badc49cbda6c9df430f3df6f Mon Sep 17 00:00:00 2001 From: Jeremy Date: Fri, 24 Feb 2023 11:05:31 -0500 Subject: [PATCH 5/5] Add asserts --- evalml/pipelines/utils.py | 8 ++++---- evalml/tests/pipeline_tests/test_pipeline_utils.py | 11 +++++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index 3f6cf1fb34..a475c14ea6 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -703,8 +703,8 @@ def generate_pipeline_example( output_str += """ pipeline.predict(X_holdout) -# Note: to predict on new data you have on hand -# Map the column names to AML internal names and run prediction +# Note: if you have a column mapping, to predict on new data you have on hand +# Map the column names and run prediction # X_test = X_test.rename(column_mapping, axis=1) # pipeline.predict(X_test) @@ -715,8 +715,8 @@ def generate_pipeline_example( output_str += """ pipeline.predict(X_holdout, X_train=X_train, y_train=y_train) -# Note: to predict on new data you have on hand -# Map the column names to AML internal names and run prediction +# Note: if you have a column mapping, to predict on new data you have on hand +# Map the column names and run prediction # X_test = X_test.rename(column_mapping, axis=1) # pipeline.predict(X_test, X_train=X_train, y_train=y_train) diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index fa4d435365..652c7fbb18 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -904,6 +904,17 @@ def test_generate_pipeline_example( target="target", output_file_path=output_path, ) + assert f'PATH_TO_TRAIN = "{path}"' in pipeline_example + assert f'PATH_TO_HOLDOUT = "{path}"' in pipeline_example + assert 'TARGET = "target"' in pipeline_example + assert 'column_mapping = ""' in pipeline_example + assert generate_pipeline_code(pipeline) in pipeline_example + + if is_time_series(automl_type): + assert "predict(X_test, X_train=X_train, y_train=y_train)" in pipeline_example + else: + assert "predict(X_test)" in pipeline_example + exec(pipeline_example) assert os.path.exists(output_path)