Copyright (c) Microsoft Corporation. All rights reserved.  
Licensed under the MIT License.

![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/NotebookVM/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-data-dependency-steps.png)

# Azure Machine Learning Pipeline for Data Prep

### Azure Machine Learning and Pipeline SDK-specific Imports

In [1]:
import azureml.core
import azureml.dataprep
from azureml.core import Workspace, Experiment, Datastore, Dataset
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.widgets import RunDetails
from azureml.data import TabularDataset
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Report which azureml-core SDK version this notebook is running against
sdk_version = azureml.core.VERSION
print("SDK version:", sdk_version)

SDK version: 1.0.83


### Initialize Workspace and Retrieve a Compute Target

In [2]:
# Exception type raised when the compute target lookup below fails
from azureml.core.compute_target import ComputeTargetException

# Load the workspace via Workspace.from_config() and print its identifying fields
ws = Workspace.from_config()
print("== Workspace:")
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')

# Default datastore (Azure blob storage), looked up by its registered name.
# ws.get_default_datastore() is an alternative way to obtain a datastore.
blob_store = Datastore(ws, "workspaceblobstore")
print("== Datastore: {}".format(blob_store.name))

# List the names of the compute targets available in the workspace
print("== Compute targets:")
for ct in ws.compute_targets:
    print("  " + ct)

# Retrieve the compute target the pipeline step will run on
aml_compute_target = "agd-training-cpu"
try:
    aml_compute = AmlCompute(ws, aml_compute_target)
except ComputeTargetException:
    print("== AML compute target not found: " + aml_compute_target)
    # Fail here instead of continuing: otherwise aml_compute is never bound and
    # the step definition that uses it fails later with an unrelated NameError.
    raise
else:
    print("== AML compute target attached: " + aml_compute_target)

== Workspace:
agd-mlws
azure-ml-workshop
westus2
c5ec24ce-9c5f-4da2-bf12-9ca8e9758d60
== Datastore: workspaceblobstore
== Compute targets:
  agd-inference
  agd-inference-v
  agd-training-gpu
  agd-training-cpu
== AML compute target attached: agd-training-cpu


### Create Compute Configuration

This step runs in a Docker image. Use a [**RunConfiguration**](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.runconfiguration?view=azure-ml-py) to specify these requirements, and pass it in when creating the PythonScriptStep.

In [3]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.core.conda_dependencies import CondaDependencies

# Start from a fresh run configuration and work on its environment section
run_config = RunConfiguration()
environment = run_config.environment

# Run inside Docker, using the default CPU-based image as the base image
docker_section = environment.docker
docker_section.enabled = True
docker_section.base_image = DEFAULT_CPU_IMAGE

# Dependencies are not user-managed: the conda environment is described by the
# data-prep-pipeline.yml file. (CondaDependencies.create(...) with explicit
# conda/pip package lists is an alternative to loading the file.)
python_section = environment.python
python_section.user_managed_dependencies = False
python_section.conda_dependencies = CondaDependencies(
    conda_dependencies_file_path='data-prep-pipeline.yml')

print("== Run Configuration created")

== Run Configuration created


## Building Pipeline Steps with Inputs and Outputs
As mentioned earlier, a step in the pipeline can take data as input. This data can be a data source that lives in one of the accessible data locations, or intermediate data produced by a previous step in the pipeline.

### Pipeline steps
Machine learning pipelines can have many steps, and these steps can use or reuse data sources and intermediate data. Here's how we construct such a pipeline:

In [4]:
# The best practice is to use a separate folder for each step's script and its dependent
# files, and to specify that folder as the source_directory for the step.
# This helps reduce the size of the snapshot created for the step (only the specific folder is snapshotted).
# Since a change to any file in the source_directory triggers a re-upload of the snapshot, this also
# helps keep the step reusable when nothing in its source_directory has changed.
source_directory_featurization = 'src/featurization'

# Input: the registered dataset named "clustering-training"
clustering_ds = Dataset.get_by_name(ws, "clustering-training")

# Output: intermediate pipeline data on the blob datastore, exposed as a dataset and
# registered as "clustering-training-featurized".
# Keep the object returned by register() instead of discarding it, so the registration
# settings are carried by the object handed to the step.
# NOTE(review): confirm register()'s return semantics against the installed SDK version.
clustering_featurized_ds = (
    PipelineData("output", datastore=blob_store)
    .as_dataset()
    .register(name="clustering-training-featurized", create_new_version=True)
)

# Featurize date/hour columns
featurization_step = PythonScriptStep(
    script_name="featurize.py",
    arguments=["--date_column", "date",
               "--hour_column", "he",
               "--datetime_column_name", "DATETIME",
               "--output", clustering_featurized_ds],
    inputs=[clustering_ds.as_named_input("input")],
    outputs=[clustering_featurized_ds],
    compute_target=aml_compute,
    source_directory=source_directory_featurization,
    runconfig=run_config
)
print("== PythonScriptStep featurization_step created")

== PythonScriptStep featurization_step created


### Build the pipeline and submit an Experiment run

In [5]:
# Assemble the pipeline from its single featurization step
pipeline_steps = [featurization_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("== Pipeline is built")

== Pipeline is built


In [6]:
# Submit the pipeline under the 'use-case-2-data-prep' experiment
experiment = Experiment(ws, 'use-case-2-data-prep')
pipeline_run = experiment.submit(pipeline)
print("== Pipeline is submitted for execution")

Created step featurize.py [998da97b][41ef6415-e70e-424d-bd82-3f0897d4a9ff], (This step is eligible to reuse a previous run's output)
Submitted PipelineRun 22f3dd5c-6b24-4a7a-b9cf-4eb847a43505
Link to Azure Machine Learning studio: https://ml.azure.com/experiments/use-case-2-data-prep/runs/22f3dd5c-6b24-4a7a-b9cf-4eb847a43505?wsid=/subscriptions/c5ec24ce-9c5f-4da2-bf12-9ca8e9758d60/resourcegroups/azure-ml-workshop/workspaces/agd-mlws
== Pipeline is submitted for execution


In [7]:
# Show the run-details widget for the submitted pipeline run
run_widget = RunDetails(pipeline_run)
run_widget.show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

### Wait for pipeline run to complete

In [8]:
# Wait for the pipeline run to finish, with output shown in the cell (show_output=True).
# In the recorded output below, a failed step surfaced here as an ActivityFailedException.
pipeline_run.wait_for_completion(show_output=True)

PipelineRunId: 22f3dd5c-6b24-4a7a-b9cf-4eb847a43505
Link to Portal: https://ml.azure.com/experiments/use-case-2-data-prep/runs/22f3dd5c-6b24-4a7a-b9cf-4eb847a43505?wsid=/subscriptions/c5ec24ce-9c5f-4da2-bf12-9ca8e9758d60/resourcegroups/azure-ml-workshop/workspaces/agd-mlws
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: b9c446b8-43b6-4f93-b342-83b501dc0ea3
Link to Portal: https://ml.azure.com/experiments/use-case-2-data-prep/runs/b9c446b8-43b6-4f93-b342-83b501dc0ea3?wsid=/subscriptions/c5ec24ce-9c5f-4da2-bf12-9ca8e9758d60/resourcegroups/azure-ml-workshop/workspaces/agd-mlws
StepRun( featurize.py ) Status: NotStarted
StepRun( featurize.py ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_f91aff41b10e6bc3751c8c01038f29ba5b6ce43fc32986864a3653370426d186_d.txt
2020-04-14T00:26:34Z Starting output-watcher...
2020-04-14T00:26:34Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
Login Succeeded
Using default tag: latest
latest: Pulling f

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "User program failed with DatasetExecutionError: (Reading 30809 bytes from azure blob storage timed out after 38814.5256 ms.)|session_id=0f4fe872-f748-4418-88a0-e1b155609845",
        "detailsUri": "https://aka.ms/azureml-known-errors",
        "details": [],
        "debugInfo": {
            "type": "DatasetExecutionError",
            "message": "(Reading 30809 bytes from azure blob storage timed out after 38814.5256 ms.)|session_id=0f4fe872-f748-4418-88a0-e1b155609845",
            "stackTrace": "  File \"/mnt/batch/tasks/shared/LS_root/jobs/agd-mlws/azureml/b9c446b8-43b6-4f93-b342-83b501dc0ea3/mounts/workspaceblobstore/azureml/b9c446b8-43b6-4f93-b342-83b501dc0ea3/azureml-setup/context_manager_injector.py\", line 127, in execute_with_context\n    runpy.run_path(sys.argv[0], globals(), run_name=\"__main__\")\n  File \"/azureml-envs/azureml_ed39ba8e8af4352edb337d7a448246d0/lib/python3.7/runpy.py\", line 263, in run_path\n    pkg_name=pkg_name, script_name=fname)\n  File \"/azureml-envs/azureml_ed39ba8e8af4352edb337d7a448246d0/lib/python3.7/runpy.py\", line 96, in _run_module_code\n    mod_name, mod_spec, pkg_name, script_name)\n  File \"/azureml-envs/azureml_ed39ba8e8af4352edb337d7a448246d0/lib/python3.7/runpy.py\", line 85, in _run_code\n    exec(code, run_globals)\n  File \"featurize.py\", line 31, in <module>\n    input_df = input_ds.to_pandas_dataframe()\n  File \"/azureml-envs/azureml_ed39ba8e8af4352edb337d7a448246d0/lib/python3.7/site-packages/azureml/data/_loggerfactory.py\", line 106, in wrapper\n    return func(*args, **kwargs)\n  File \"/azureml-envs/azureml_ed39ba8e8af4352edb337d7a448246d0/lib/python3.7/site-packages/azureml/data/tabular_dataset.py\", line 166, in to_pandas_dataframe\n    df = _try_execute(lambda: dataflow.to_pandas_dataframe(on_error=on_error,\n  File \"/azureml-envs/azureml_ed39ba8e8af4352edb337d7a448246d0/lib/python3.7/site-packages/azureml/data/dataset_error_handling.py\", line 85, in _try_execute\n    raise DatasetExecutionError(str(e))\n"
        }
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"User program failed with DatasetExecutionError: (Reading 30809 bytes from azure blob storage timed out after 38814.5256 ms.)|session_id=0f4fe872-f748-4418-88a0-e1b155609845\",\n        \"detailsUri\": \"https://aka.ms/azureml-known-errors\",\n        \"details\": [],\n        \"debugInfo\": {\n            \"type\": \"DatasetExecutionError\",\n            \"message\": \"(Reading 30809 bytes from azure blob storage timed out after 38814.5256 ms.)|session_id=0f4fe872-f748-4418-88a0-e1b155609845\",\n            \"stackTrace\": \"  File \\\"/mnt/batch/tasks/shared/LS_root/jobs/agd-mlws/azureml/b9c446b8-43b6-4f93-b342-83b501dc0ea3/mounts/workspaceblobstore/azureml/b9c446b8-43b6-4f93-b342-83b501dc0ea3/azureml-setup/context_manager_injector.py\\\", line 127, in execute_with_context\\n    runpy.run_path(sys.argv[0], globals(), run_name=\\\"__main__\\\")\\n  File \\\"/azureml-envs/azureml_ed39ba8e8af4352edb337d7a448246d0/lib/python3.7/runpy.py\\\", line 263, in run_path\\n    pkg_name=pkg_name, script_name=fname)\\n  File \\\"/azureml-envs/azureml_ed39ba8e8af4352edb337d7a448246d0/lib/python3.7/runpy.py\\\", line 96, in _run_module_code\\n    mod_name, mod_spec, pkg_name, script_name)\\n  File \\\"/azureml-envs/azureml_ed39ba8e8af4352edb337d7a448246d0/lib/python3.7/runpy.py\\\", line 85, in _run_code\\n    exec(code, run_globals)\\n  File \\\"featurize.py\\\", line 31, in <module>\\n    input_df = input_ds.to_pandas_dataframe()\\n  File \\\"/azureml-envs/azureml_ed39ba8e8af4352edb337d7a448246d0/lib/python3.7/site-packages/azureml/data/_loggerfactory.py\\\", line 106, in wrapper\\n    return func(*args, **kwargs)\\n  File \\\"/azureml-envs/azureml_ed39ba8e8af4352edb337d7a448246d0/lib/python3.7/site-packages/azureml/data/tabular_dataset.py\\\", line 166, in to_pandas_dataframe\\n    df = _try_execute(lambda: dataflow.to_pandas_dataframe(on_error=on_error,\\n  File 
\\\"/azureml-envs/azureml_ed39ba8e8af4352edb337d7a448246d0/lib/python3.7/site-packages/azureml/data/dataset_error_handling.py\\\", line 85, in _try_execute\\n    raise DatasetExecutionError(str(e))\\n\"\n        }\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}

### See Outputs

See where outputs of each pipeline step are located on your datastore.

***Wait for pipeline run to complete, to make sure all the outputs are ready***

In [None]:
# Walk every step of the pipeline run and report where each of its outputs lives
for step_run in pipeline_run.get_steps():
    print("== Outputs of step " + step_run.name)

    # get_outputs() maps each output name to its StepRunOutput
    for output_name, step_output in step_run.get_outputs().items():
        # Resolve the output port's data reference (datastore name + path)
        port_reference = step_output.get_port_data_reference()
        print("\tname: " + output_name)
        print("\tdatastore: " + port_reference.datastore_name)
        print("\tpath on datastore: " + port_reference.path_on_datastore)