Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement generate_pipeline_example #4023

Merged
merged 11 commits into from
Feb 24, 2023
3 changes: 2 additions & 1 deletion docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Move black to regular dependency and use it for ``generate_pipeline_code`` :pr:`4005`
* Move black to regular dependency and use it for ``generate_pipeline_code`` :pr:`4005`
* Implement ``generate_pipeline_example`` :pr:`4023`
* Add new downcast utils for component-specific nullable type handling and begin implementation on objective and component base classes :pr:`4024`
* Fixes
* Changes
Expand Down
84 changes: 84 additions & 0 deletions evalml/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -646,6 +646,90 @@ def generate_pipeline_code(element):
return pipeline_code


def generate_pipeline_example(
    pipeline,
    path_to_train,
    path_to_holdout,
    target,
    path_to_mapping="",
    output_file_path=None,
):
    """Creates and returns a runnable Python script that fits the given EvalML pipeline and predicts on holdout data.

    The script imports evalml, woodwork and pandas, embeds the output of
    ``generate_pipeline_code(pipeline)``, reads the training CSV, fits the
    pipeline, reads the holdout CSV and calls ``predict`` on it. For time series
    problem types the ``predict`` call also receives ``X_train`` and ``y_train``.

    Args:
        pipeline (pipeline instance): The instance of the pipeline to generate string Python code.
        path_to_train (str): Path to the training data CSV; emitted as ``PATH_TO_TRAIN``.
        path_to_holdout (str): Path to the holdout data CSV; emitted as ``PATH_TO_HOLDOUT``.
        target (str): Name of the target column in both CSV files; emitted as ``TARGET``.
        path_to_mapping (str): Path to mapping json; emitted as the ``column_mapping`` variable. Defaults to "".
        output_file_path (str): Path to output python file. If falsy, nothing is written to disk. Defaults to None.

    Returns:
        str: String representation of Python code that can be run separately in order to recreate and fit the pipeline.
            Does not include code for custom component implementation.

    """
    # Function-scope import so this block does not depend on the module's import header.
    import json

    def _as_literal(value):
        # Emit a double-quoted, escaped literal. Plain values render exactly as
        # before ('"value"'), while backslashes (e.g. Windows paths) and embedded
        # quotes no longer corrupt the generated script. JSON string escapes are
        # all valid Python string escapes.
        return json.dumps(str(value), ensure_ascii=False)

    train_literal = _as_literal(path_to_train)
    holdout_literal = _as_literal(path_to_holdout)
    target_literal = _as_literal(target)
    mapping_literal = _as_literal(path_to_mapping)

    # The template body stays at column 0 because it becomes top-level code
    # in the generated script.
    output_str = f"""
import evalml
import woodwork
import pandas as pd

PATH_TO_TRAIN = {train_literal}
PATH_TO_HOLDOUT = {holdout_literal}
TARGET = {target_literal}
column_mapping = {mapping_literal}

# This is the machine learning pipeline you have exported.
# By running this code you will fit the pipeline on the files provided
# and you can then use this pipeline for prediction and model understanding.
{generate_pipeline_code(pipeline)}

print(pipeline.name)
print(pipeline.parameters)
pipeline.describe()

df = pd.read_csv(PATH_TO_TRAIN)
y_train = df[TARGET]
X_train = df.drop(TARGET, axis=1)

pipeline.fit(X_train, y_train)

# You can now generate predictions as well as run model understanding.
df = pd.read_csv(PATH_TO_HOLDOUT)
y_holdout = df[TARGET]
X_holdout= df.drop(TARGET, axis=1)
"""
    if is_time_series(pipeline.problem_type):
        # Time series pipelines are given the training data again at predict time.
        output_str += """
pipeline.predict(X_holdout, X_train=X_train, y_train=y_train)

# Note: if you have a column mapping, to predict on new data you have on hand
# Map the column names and run prediction
# X_test = X_test.rename(column_mapping, axis=1)
# pipeline.predict(X_test, X_train=X_train, y_train=y_train)

# For more info please check out:
# https://evalml.alteryx.com/en/stable/user_guide/automl.html
"""
    else:
        output_str += """
pipeline.predict(X_holdout)

# Note: if you have a column mapping, to predict on new data you have on hand
# Map the column names and run prediction
# X_test = X_test.rename(column_mapping, axis=1)
# pipeline.predict(X_test)

# For more info please check out:
# https://evalml.alteryx.com/en/stable/user_guide/automl.html
"""

    if output_file_path:
        # Python source files are UTF-8 by default, so write the script that way
        # regardless of the platform's locale encoding.
        with open(output_file_path, "w", encoding="utf-8") as text_file:
            text_file.write(output_str)
    return output_str


def _make_stacked_ensemble_pipeline(
input_pipelines,
problem_type,
Expand Down
85 changes: 85 additions & 0 deletions evalml/tests/pipeline_tests/test_pipeline_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from unittest.mock import patch

import black
Expand Down Expand Up @@ -46,6 +47,7 @@
_get_preprocessing_components,
_make_pipeline_from_multiple_graphs,
generate_pipeline_code,
generate_pipeline_example,
get_estimators,
is_classification,
is_regression,
Expand Down Expand Up @@ -834,6 +836,89 @@ def __init__(self, random_arg=False, random_seed=0):
assert pipeline == expected_code


@pytest.mark.parametrize(
    "automl_type",
    [
        ProblemTypes.BINARY,
        ProblemTypes.MULTICLASS,
        ProblemTypes.REGRESSION,
        ProblemTypes.TIME_SERIES_REGRESSION,
        ProblemTypes.TIME_SERIES_MULTICLASS,
        ProblemTypes.TIME_SERIES_BINARY,
    ],
)
def test_generate_pipeline_example(
    automl_type,
    tmpdir,
    AutoMLTestEnv,
    X_y_binary,
    X_y_multi,
    X_y_regression,
    ts_data,
):
    """Check that generate_pipeline_example emits a runnable script for each problem type.

    Runs a short AutoML search, exports the best pipeline as an example script
    pointing at a CSV written to ``tmpdir``, asserts on the script contents,
    executes it, and checks that the script was also written to disk.
    """
    path = os.path.join(str(tmpdir), "train.csv")
    if automl_type == ProblemTypes.BINARY:
        X, y = X_y_binary
    elif automl_type == ProblemTypes.MULTICLASS:
        X, y = X_y_multi
    elif automl_type == ProblemTypes.REGRESSION:
        X, y = X_y_regression
    else:
        # Every time series problem type uses the same fixture call.
        X, _, y = ts_data(problem_type=automl_type)

    from evalml import AutoMLSearch

    problem_configuration = None
    if is_time_series(automl_type):
        problem_configuration = {
            "time_index": "date",
            "gap": 1,
            "max_delay": 1,
            "forecast_horizon": 3,
        }

    aml = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type=automl_type,
        optimize_thresholds=False,
        max_time=1,
        max_iterations=5,
        problem_configuration=problem_configuration,
    )
    env = AutoMLTestEnv(automl_type)
    with env.test_context(score_return_value={aml.objective.name: 1.0}):
        aml.search()
    pipeline = aml.best_pipeline

    # The same CSV serves as both training and holdout data for the script.
    X["target"] = y
    X.to_csv(path)
    output_path = os.path.join(str(tmpdir), "example.py")
    pipeline_example = generate_pipeline_example(
        pipeline=pipeline,
        path_to_train=path,
        path_to_holdout=path,
        target="target",
        output_file_path=output_path,
    )
    assert f'PATH_TO_TRAIN = "{path}"' in pipeline_example
    assert f'PATH_TO_HOLDOUT = "{path}"' in pipeline_example
    assert 'TARGET = "target"' in pipeline_example
    assert 'column_mapping = ""' in pipeline_example
    assert generate_pipeline_code(pipeline) in pipeline_example

    if is_time_series(automl_type):
        # Live call on the holdout data, not just the commented-out hint.
        assert (
            "\npipeline.predict(X_holdout, X_train=X_train, y_train=y_train)\n"
            in pipeline_example
        )
        assert "predict(X_test, X_train=X_train, y_train=y_train)" in pipeline_example
    else:
        assert "\npipeline.predict(X_holdout)\n" in pipeline_example
        assert "predict(X_test)" in pipeline_example

    exec(pipeline_example)
    assert os.path.exists(output_path)


def test_rows_of_interest_errors(X_y_binary):
pipeline = BinaryClassificationPipeline(
component_graph=["Logistic Regression Classifier"],
Expand Down