def generate_pipeline_example(
    pipeline,
    path_to_train,
    path_to_holdout,
    target,
    path_to_mapping="",
    output_file_path=None,
):
    """Creates and returns a string that contains the Python imports and code required for running the EvalML pipeline.

    Args:
        pipeline (pipeline instance): The instance of the pipeline to generate string Python code.
        path_to_train (str): Path to training data.
        path_to_holdout (str): Path to holdout data.
        target (str): Name of the target column.
        path_to_mapping (str): Path to a column mapping JSON file. Defaults to "".
        output_file_path (str): Optional path to write the generated Python file to. Defaults to None.

    Returns:
        str: String representation of Python code that can be run separately in order to recreate the pipeline instance.
            Does not include code for custom component implementation.

    """
    # Header of the generated script: imports, data paths, the exported pipeline
    # definition, and fitting the pipeline on the training data.
    output_str = f"""
import evalml
import woodwork
import pandas as pd

PATH_TO_TRAIN = "{path_to_train}"
PATH_TO_HOLDOUT = "{path_to_holdout}"
TARGET = "{target}"
column_mapping = "{path_to_mapping}"

# This is the machine learning pipeline you have exported.
# By running this code you will fit the pipeline on the files provided
# and you can then use this pipeline for prediction and model understanding.
{generate_pipeline_code(pipeline)}

print(pipeline.name)
print(pipeline.parameters)
pipeline.describe()

df = pd.read_csv(PATH_TO_TRAIN)
y_train = df[TARGET]
X_train = df.drop(TARGET, axis=1)

pipeline.fit(X_train, y_train)

# You can now generate predictions as well as run model understanding.
df = pd.read_csv(PATH_TO_HOLDOUT)
y_holdout = df[TARGET]
X_holdout = df.drop(TARGET, axis=1)
"""
    # Time series pipelines require the training data to be passed at predict
    # time; everything else in the prediction section is identical, so build
    # the extra keyword arguments once instead of duplicating the whole block.
    if is_time_series(pipeline.problem_type):
        predict_kwargs = ", X_train=X_train, y_train=y_train"
    else:
        predict_kwargs = ""
    output_str += f"""
pipeline.predict(X_holdout{predict_kwargs})

# Note: if you have a column mapping, to predict on new data you have on hand
# Map the column names and run prediction
# X_test = X_test.rename(column_mapping, axis=1)
# pipeline.predict(X_test{predict_kwargs})

# For more info please check out:
# https://evalml.alteryx.com/en/stable/user_guide/automl.html
"""

    if output_file_path:
        with open(output_file_path, "w") as text_file:
            text_file.write(output_str)
    return output_str
@pytest.mark.parametrize(
    "automl_type",
    [
        ProblemTypes.BINARY,
        ProblemTypes.MULTICLASS,
        ProblemTypes.REGRESSION,
        ProblemTypes.TIME_SERIES_REGRESSION,
        ProblemTypes.TIME_SERIES_MULTICLASS,
        ProblemTypes.TIME_SERIES_BINARY,
    ],
)
def test_generate_pipeline_example(
    automl_type,
    tmpdir,
    AutoMLTestEnv,
    X_y_binary,
    X_y_multi,
    X_y_regression,
    ts_data,
):
    """Check that generate_pipeline_example embeds the data paths, target name,
    pipeline code, and the correct predict call, and that its output is itself
    runnable Python that is also written to output_file_path."""
    path = os.path.join(str(tmpdir), "train.csv")
    # Pick the dataset fixture matching the problem type; all time series
    # flavors (regression, binary, multiclass) come from the same ts_data fixture.
    if automl_type == ProblemTypes.BINARY:
        X, y = X_y_binary
    elif automl_type == ProblemTypes.MULTICLASS:
        X, y = X_y_multi
    elif automl_type == ProblemTypes.REGRESSION:
        X, y = X_y_regression
    else:
        X, _, y = ts_data(problem_type=automl_type)

    # Imported locally to avoid a circular import at module load time.
    from evalml import AutoMLSearch

    aml = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type=automl_type,
        optimize_thresholds=False,
        max_time=1,
        max_iterations=5,
        problem_configuration={
            "time_index": "date",
            "gap": 1,
            "max_delay": 1,
            "forecast_horizon": 3,
        }
        if is_time_series(automl_type)
        else None,
    )
    env = AutoMLTestEnv(automl_type)
    with env.test_context(score_return_value={aml.objective.name: 1.0}):
        aml.search()
    pipeline = aml.best_pipeline

    # Persist the training data (with the target column appended) so the
    # generated example script can read it back from disk.
    X["target"] = y
    X.to_csv(path)
    output_path = os.path.join(str(tmpdir), "example.py")
    pipeline_example = generate_pipeline_example(
        pipeline=pipeline,
        path_to_train=path,
        path_to_holdout=path,
        target="target",
        output_file_path=output_path,
    )
    assert f'PATH_TO_TRAIN = "{path}"' in pipeline_example
    assert f'PATH_TO_HOLDOUT = "{path}"' in pipeline_example
    assert 'TARGET = "target"' in pipeline_example
    assert 'column_mapping = ""' in pipeline_example
    assert generate_pipeline_code(pipeline) in pipeline_example

    # Time series examples must pass training data to predict; others must not.
    if is_time_series(automl_type):
        assert "predict(X_test, X_train=X_train, y_train=y_train)" in pipeline_example
    else:
        assert "predict(X_test)" in pipeline_example

    # The generated example must itself execute end-to-end (fit + predict).
    exec(pipeline_example)
    assert os.path.exists(output_path)