Implement generate_pipeline_example (#4023)
jeremyliweishih committed Feb 24, 2023
1 parent 5ab965f commit b824d26
Showing 3 changed files with 171 additions and 1 deletion.
3 changes: 2 additions & 1 deletion docs/source/release_notes.rst
@@ -2,7 +2,8 @@ Release Notes
-------------
**Future Releases**
* Enhancements
-    * Move black to regular dependency and use it for ``generate_pipeline_code`` :pr:`4005`
+    * Move black to regular dependency and use it for ``generate_pipeline_code`` :pr:`4005`
+    * Implement ``generate_pipeline_example`` :pr:`4023`
* Add new downcast utils for component-specific nullable type handling and begin implementation on objective and component base classes :pr:`4024`
* Fixes
* Changes
84 changes: 84 additions & 0 deletions evalml/pipelines/utils.py
@@ -646,6 +646,90 @@ def generate_pipeline_code(element):
return pipeline_code


def generate_pipeline_example(
    pipeline,
    path_to_train,
    path_to_holdout,
    target,
    path_to_mapping="",
    output_file_path=None,
):
"""Creates and returns a string that contains the Python imports and code required for running the EvalML pipeline.
Args:
pipeline (pipeline instance): The instance of the pipeline to generate string Python code.
path_to_train (str): path to training data.
path_to_holdout (str): path to holdout data.
target (str): target variable.
path_to_mapping (str): path to mapping json
output_file_path (str): path to output python file.
Returns:
str: String representation of Python code that can be run separately in order to recreate the pipeline instance.
Does not include code for custom component implementation.
"""
    output_str = f"""
import evalml
import woodwork
import pandas as pd
PATH_TO_TRAIN = "{path_to_train}"
PATH_TO_HOLDOUT = "{path_to_holdout}"
TARGET = "{target}"
column_mapping = "{path_to_mapping}"
# This is the machine learning pipeline you have exported.
# By running this code you will fit the pipeline on the files provided
# and you can then use this pipeline for prediction and model understanding.
{generate_pipeline_code(pipeline)}
print(pipeline.name)
print(pipeline.parameters)
pipeline.describe()
df = pd.read_csv(PATH_TO_TRAIN)
y_train = df[TARGET]
X_train = df.drop(TARGET, axis=1)
pipeline.fit(X_train, y_train)
# You can now generate predictions as well as run model understanding.
df = pd.read_csv(PATH_TO_HOLDOUT)
y_holdout = df[TARGET]
X_holdout = df.drop(TARGET, axis=1)
"""
    if not is_time_series(pipeline.problem_type):
        output_str += """
pipeline.predict(X_holdout)
# Note: if you have a column mapping, to predict on new data you have on hand
# Map the column names and run prediction
# X_test = X_test.rename(column_mapping, axis=1)
# pipeline.predict(X_test)
# For more info please check out:
# https://evalml.alteryx.com/en/stable/user_guide/automl.html
"""
    else:
        output_str += """
pipeline.predict(X_holdout, X_train=X_train, y_train=y_train)
# Note: if you have a column mapping, to predict on new data you have on hand
# Map the column names and run prediction
# X_test = X_test.rename(column_mapping, axis=1)
# pipeline.predict(X_test, X_train=X_train, y_train=y_train)
# For more info please check out:
# https://evalml.alteryx.com/en/stable/user_guide/automl.html
"""

    if output_file_path:
        with open(output_file_path, "w") as text_file:
            text_file.write(output_str)
    return output_str


def _make_stacked_ensemble_pipeline(
input_pipelines,
problem_type,
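For context, here is a minimal sketch of how the new helper could be called. The CSV paths, target column name, and output path below are hypothetical, and `pipeline` is assumed to be an existing EvalML pipeline instance, for example `AutoMLSearch(...).best_pipeline`:

# Minimal usage sketch (hypothetical paths, target column, and output file).
# Assumes `pipeline` is an EvalML pipeline instance, e.g. AutoMLSearch(...).best_pipeline.
from evalml.pipelines.utils import generate_pipeline_example

example_script = generate_pipeline_example(
    pipeline=pipeline,
    path_to_train="data/train.csv",          # hypothetical training CSV
    path_to_holdout="data/holdout.csv",      # hypothetical holdout CSV
    target="target",                         # hypothetical target column name
    output_file_path="pipeline_example.py",  # also writes the script to this file
)
print(example_script)  # the returned string is the complete runnable script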
85 changes: 85 additions & 0 deletions evalml/tests/pipeline_tests/test_pipeline_utils.py
@@ -1,3 +1,4 @@
import os
from unittest.mock import patch

import black
@@ -46,6 +47,7 @@
    _get_preprocessing_components,
    _make_pipeline_from_multiple_graphs,
    generate_pipeline_code,
    generate_pipeline_example,
    get_estimators,
    is_classification,
    is_regression,
@@ -834,6 +836,89 @@ def __init__(self, random_arg=False, random_seed=0):
    assert pipeline == expected_code


@pytest.mark.parametrize(
    "automl_type",
    [
        ProblemTypes.BINARY,
        ProblemTypes.MULTICLASS,
        ProblemTypes.REGRESSION,
        ProblemTypes.TIME_SERIES_REGRESSION,
        ProblemTypes.TIME_SERIES_MULTICLASS,
        ProblemTypes.TIME_SERIES_BINARY,
    ],
)
def test_generate_pipeline_example(
    automl_type,
    tmpdir,
    AutoMLTestEnv,
    X_y_binary,
    X_y_multi,
    X_y_regression,
    ts_data,
):
    path = os.path.join(str(tmpdir), "train.csv")
    if automl_type == ProblemTypes.BINARY:
        X, y = X_y_binary
    elif automl_type == ProblemTypes.MULTICLASS:
        X, y = X_y_multi
    elif automl_type == ProblemTypes.REGRESSION:
        X, y = X_y_regression
    elif (
        automl_type == ProblemTypes.TIME_SERIES_MULTICLASS
        or automl_type == ProblemTypes.TIME_SERIES_BINARY
    ):
        X, _, y = ts_data(problem_type=automl_type)
    else:
        X, _, y = ts_data(problem_type=automl_type)

    from evalml import AutoMLSearch

    aml = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type=automl_type,
        optimize_thresholds=False,
        max_time=1,
        max_iterations=5,
        problem_configuration={
            "time_index": "date",
            "gap": 1,
            "max_delay": 1,
            "forecast_horizon": 3,
        }
        if is_time_series(automl_type)
        else None,
    )
    env = AutoMLTestEnv(automl_type)
    with env.test_context(score_return_value={aml.objective.name: 1.0}):
        aml.search()
    pipeline = aml.best_pipeline

    X["target"] = y
    X.to_csv(path)
    output_path = os.path.join(str(tmpdir), "example.py")
    pipeline_example = generate_pipeline_example(
        pipeline=pipeline,
        path_to_train=path,
        path_to_holdout=path,
        target="target",
        output_file_path=output_path,
    )
    assert f'PATH_TO_TRAIN = "{path}"' in pipeline_example
    assert f'PATH_TO_HOLDOUT = "{path}"' in pipeline_example
    assert 'TARGET = "target"' in pipeline_example
    assert 'column_mapping = ""' in pipeline_example
    assert generate_pipeline_code(pipeline) in pipeline_example

    if is_time_series(automl_type):
        assert "predict(X_test, X_train=X_train, y_train=y_train)" in pipeline_example
    else:
        assert "predict(X_test)" in pipeline_example

    exec(pipeline_example)
    assert os.path.exists(output_path)


def test_rows_of_interest_errors(X_y_binary):
    pipeline = BinaryClassificationPipeline(
        component_graph=["Logistic Regression Classifier"],
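As a side note on the commented column-mapping hint in the generated script: `column_mapping` there holds the path passed as `path_to_mapping`, so a sketch of putting it to use might look like the following, assuming the JSON file maps raw column names to the names the pipeline was trained on (the file name and column names are hypothetical, and `pd` and `pipeline` come from the generated script itself):

# Hypothetical follow-on to the generated script: load the column-mapping JSON
# and align a new dataset's column names with the training schema before predicting.
import json

with open(column_mapping) as f:  # column_mapping is the path passed as path_to_mapping
    mapping = json.load(f)       # e.g. {"amount_usd": "amount", "signup_date": "date"}

X_test = pd.read_csv("data/new_data.csv")   # hypothetical new data
X_test = X_test.rename(mapping, axis=1)
predictions = pipeline.predict(X_test)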
