Implement generate_pipeline_example (#4023)
jeremyliweishih committed Feb 24, 2023
1 parent 5ab965f commit b824d26
Showing 3 changed files with 171 additions and 1 deletion.
3 changes: 2 additions & 1 deletion docs/source/release_notes.rst
@@ -2,7 +2,8 @@ Release Notes
-------------
**Future Releases**
* Enhancements
-    * Move black to regular dependency and use it for ``generate_pipeline_code`` :pr:`4005`
+    * Move black to regular dependency and use it for ``generate_pipeline_code`` :pr:`4005`
+    * Implement ``generate_pipeline_example`` :pr:`4023`
* Add new downcast utils for component-specific nullable type handling and begin implementation on objective and component base classes :pr:`4024`
* Fixes
* Changes
84 changes: 84 additions & 0 deletions evalml/pipelines/utils.py
@@ -646,6 +646,90 @@ def generate_pipeline_code(element):
return pipeline_code


def generate_pipeline_example(
    pipeline,
    path_to_train,
    path_to_holdout,
    target,
    path_to_mapping="",
    output_file_path=None,
):
"""Creates and returns a string that contains the Python imports and code required for running the EvalML pipeline.
Args:
pipeline (pipeline instance): The instance of the pipeline to generate string Python code.
path_to_train (str): path to training data.
path_to_holdout (str): path to holdout data.
target (str): target variable.
path_to_mapping (str): path to mapping json
output_file_path (str): path to output python file.
Returns:
str: String representation of Python code that can be run separately in order to recreate the pipeline instance.
Does not include code for custom component implementation.
"""
    output_str = f"""
import evalml
import woodwork
import pandas as pd
PATH_TO_TRAIN = "{path_to_train}"
PATH_TO_HOLDOUT = "{path_to_holdout}"
TARGET = "{target}"
column_mapping = "{path_to_mapping}"
# This is the machine learning pipeline you have exported.
# By running this code you will fit the pipeline on the files provided
# and you can then use this pipeline for prediction and model understanding.
{generate_pipeline_code(pipeline)}
print(pipeline.name)
print(pipeline.parameters)
pipeline.describe()
df = pd.read_csv(PATH_TO_TRAIN)
y_train = df[TARGET]
X_train = df.drop(TARGET, axis=1)
pipeline.fit(X_train, y_train)
# You can now generate predictions as well as run model understanding.
df = pd.read_csv(PATH_TO_HOLDOUT)
y_holdout = df[TARGET]
X_holdout = df.drop(TARGET, axis=1)
"""
    if not is_time_series(pipeline.problem_type):
        output_str += """
pipeline.predict(X_holdout)
# Note: if you have a column mapping, to predict on new data you have on hand
# Map the column names and run prediction
# X_test = X_test.rename(column_mapping, axis=1)
# pipeline.predict(X_test)
# For more info please check out:
# https://evalml.alteryx.com/en/stable/user_guide/automl.html
"""
    else:
        output_str += """
pipeline.predict(X_holdout, X_train=X_train, y_train=y_train)
# Note: if you have a column mapping, to predict on new data you have on hand
# Map the column names and run prediction
# X_test = X_test.rename(column_mapping, axis=1)
# pipeline.predict(X_test, X_train=X_train, y_train=y_train)
# For more info please check out:
# https://evalml.alteryx.com/en/stable/user_guide/automl.html
"""

    if output_file_path:
        with open(output_file_path, "w") as text_file:
            text_file.write(output_str)
    return output_str


def _make_stacked_ensemble_pipeline(
input_pipelines,
problem_type,
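For context, here is a minimal sketch of how the new helper could be called. The CSV paths, target column name, and output path below are hypothetical, and `pipeline` is assumed to be an existing EvalML pipeline instance, for example `AutoMLSearch(...).best_pipeline`:

# Minimal usage sketch (hypothetical paths, target column, and output file).
# Assumes `pipeline` is an EvalML pipeline instance, e.g. AutoMLSearch(...).best_pipeline.
from evalml.pipelines.utils import generate_pipeline_example

example_script = generate_pipeline_example(
    pipeline=pipeline,
    path_to_train="data/train.csv",          # hypothetical training CSV
    path_to_holdout="data/holdout.csv",      # hypothetical holdout CSV
    target="target",                         # hypothetical target column name
    output_file_path="pipeline_example.py",  # also writes the script to this file
)
print(example_script)  # the returned string is the complete runnable script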
85 changes: 85 additions & 0 deletions evalml/tests/pipeline_tests/test_pipeline_utils.py
@@ -1,3 +1,4 @@
import os
from unittest.mock import patch

import black
@@ -46,6 +47,7 @@
    _get_preprocessing_components,
    _make_pipeline_from_multiple_graphs,
    generate_pipeline_code,
    generate_pipeline_example,
    get_estimators,
    is_classification,
    is_regression,
@@ -834,6 +836,89 @@ def __init__(self, random_arg=False, random_seed=0):
    assert pipeline == expected_code


@pytest.mark.parametrize(
    "automl_type",
    [
        ProblemTypes.BINARY,
        ProblemTypes.MULTICLASS,
        ProblemTypes.REGRESSION,
        ProblemTypes.TIME_SERIES_REGRESSION,
        ProblemTypes.TIME_SERIES_MULTICLASS,
        ProblemTypes.TIME_SERIES_BINARY,
    ],
)
def test_generate_pipeline_example(
    automl_type,
    tmpdir,
    AutoMLTestEnv,
    X_y_binary,
    X_y_multi,
    X_y_regression,
    ts_data,
):
    path = os.path.join(str(tmpdir), "train.csv")
    if automl_type == ProblemTypes.BINARY:
        X, y = X_y_binary
    elif automl_type == ProblemTypes.MULTICLASS:
        X, y = X_y_multi
    elif automl_type == ProblemTypes.REGRESSION:
        X, y = X_y_regression
    elif (
        automl_type == ProblemTypes.TIME_SERIES_MULTICLASS
        or automl_type == ProblemTypes.TIME_SERIES_BINARY
    ):
        X, _, y = ts_data(problem_type=automl_type)
    else:
        X, _, y = ts_data(problem_type=automl_type)

    from evalml import AutoMLSearch

    aml = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type=automl_type,
        optimize_thresholds=False,
        max_time=1,
        max_iterations=5,
        problem_configuration={
            "time_index": "date",
            "gap": 1,
            "max_delay": 1,
            "forecast_horizon": 3,
        }
        if is_time_series(automl_type)
        else None,
    )
    env = AutoMLTestEnv(automl_type)
    with env.test_context(score_return_value={aml.objective.name: 1.0}):
        aml.search()
    pipeline = aml.best_pipeline

    X["target"] = y
    X.to_csv(path)
    output_path = os.path.join(str(tmpdir), "example.py")
    pipeline_example = generate_pipeline_example(
        pipeline=pipeline,
        path_to_train=path,
        path_to_holdout=path,
        target="target",
        output_file_path=output_path,
    )
    assert f'PATH_TO_TRAIN = "{path}"' in pipeline_example
    assert f'PATH_TO_HOLDOUT = "{path}"' in pipeline_example
    assert 'TARGET = "target"' in pipeline_example
    assert 'column_mapping = ""' in pipeline_example
    assert generate_pipeline_code(pipeline) in pipeline_example

    if is_time_series(automl_type):
        assert "predict(X_test, X_train=X_train, y_train=y_train)" in pipeline_example
    else:
        assert "predict(X_test)" in pipeline_example

    exec(pipeline_example)
    assert os.path.exists(output_path)


def test_rows_of_interest_errors(X_y_binary):
    pipeline = BinaryClassificationPipeline(
        component_graph=["Logistic Regression Classifier"],
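As a side note on the commented column-mapping hint in the generated script: `column_mapping` there holds the path passed as `path_to_mapping`, so a sketch of putting it to use might look like the following, assuming the JSON file maps raw column names to the names the pipeline was trained on (the file name and column names are hypothetical, and `pd` and `pipeline` come from the generated script itself):

# Hypothetical follow-on to the generated script: load the column-mapping JSON
# and align a new dataset's column names with the training schema before predicting.
import json

with open(column_mapping) as f:  # column_mapping is the path passed as path_to_mapping
    mapping = json.load(f)       # e.g. {"amount_usd": "amount", "signup_date": "date"}

X_test = pd.read_csv("data/new_data.csv")   # hypothetical new data
X_test = X_test.rename(mapping, axis=1)
predictions = pipeline.predict(X_test)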
