Copyright (c) Microsoft Corporation. All rights reserved.  
Licensed under the MIT License.

![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/NotebookVM/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-data-dependency-steps.png)

# Azure Machine Learning Data Prep
In this notebook, we will see how we can do data prep using the azureml.dataprep SDK.

### Azure Machine Learning and Pipeline SDK-specific Imports

In [None]:
# Core ML to create experiments and run them
import azureml.core
from azureml.core import Workspace, Experiment, Datastore, Dataset
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.widgets import RunDetails
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import PythonScriptStep

# AzureML Dataprep SDK
import azureml.dataprep as dprep

# Report which core SDK version this notebook is running against
print(f"AML Core SDK version: {azureml.core.VERSION}")

### Getting AML Workspace and Compute

In [None]:
# Load the workspace from the local config file and show its identity, one value per line
ws = Workspace.from_config()
print("== Workspace:")
for workspace_property in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_property)

# Workspace blob datastore, looked up by name
# (alternative: def_blob_store = ws.get_default_datastore())
blob_store = Datastore(ws, "workspaceblobstore")
print(f"== Datastore: {blob_store.name}")

# List the compute targets known to the workspace
print("== Compute targets:")
for compute_name in ws.compute_targets:
    print(f"  {compute_name}")

# Retrieve the compute target used by the pipeline step
# NOTE(review): when the target is not found only a message is printed, so
# `aml_compute` stays undefined and the later PythonScriptStep cell will fail.
from azureml.core.compute_target import ComputeTargetException
aml_compute_target = "agd-training-cpu"
try:
    aml_compute = AmlCompute(ws, aml_compute_target)
except ComputeTargetException:
    print("== AML compute target not found: " + aml_compute_target)
else:
    print("== AML compute target attached: " + aml_compute_target)

### Data inputs definitions

In [None]:
# Input definitions: every input is a CSV file on the workspace blob datastore
def _blob_reference(reference_name, blob_path):
    """Return a DataReference named `reference_name` for `blob_path` on blob_store."""
    return DataReference(
        datastore=blob_store,
        data_reference_name=reference_name,
        path_on_datastore=blob_path,
    )

h_ts_1_dr = _blob_reference("h_ts_1", "datasets/time-series/S_ACTUALS.csv")
h_ts_2_dr = _blob_reference("h_ts_2", "datasets/time-series/W_ACTUALS.csv")
h_ts_3_dr = _blob_reference("h_ts_3", "datasets/time-series/C_LOAD.csv")
d_ts_1_dr = _blob_reference("d_ts_1", "datasets/time-series/X1.csv")
d_ts_2_dr = _blob_reference("d_ts_2", "datasets/time-series/X2.csv")

print("== Datasets defined")

## Data prep using Azure ML Dataprep SDK

In [None]:
# Read each input as an azureml.dataprep Dataflow object
h_ts_1_dflow, h_ts_2_dflow, h_ts_3_dflow, d_ts_1_dflow, d_ts_2_dflow = [
    dprep.read_csv(data_reference)
    for data_reference in (h_ts_1_dr, h_ts_2_dr, h_ts_3_dr, d_ts_1_dr, d_ts_2_dr)
]

In [None]:
# Preview the first 6 records of the first time series
h_ts_1_dflow.head(6)

In [None]:
# Display the data profile of the first time series
h_ts_1_dflow.get_profile()

In [None]:
# Discover data types: create a column-type builder, let it learn conversion
# candidates from the data, then display the candidates for review
builder = h_ts_1_dflow.builders.set_column_types()
builder.learn()
builder.conversion_candidates

In [None]:
# Resolve the MYDATE type ambiguity by pinning it to a DATE parsed as month/day/year,
# then build the typed dataflow from the builder and re-check its profile
builder.conversion_candidates['MYDATE'] = (dprep.FieldType.DATE, ['%m/%d/%Y'])
h_ts_1_dflow = builder.to_dataflow()
h_ts_1_dflow.get_profile()

In [None]:
# Alternatively, 'auto_read_file' auto-detects types (when it can) and handles other
# details as well (skipping leading rows, finding the right header, etc.)
h_ts_1_dflow = dprep.auto_read_file(h_ts_1_dr)
h_ts_1_dflow.get_profile()

In [None]:
# =======================
# azureml-dataprep-sdk.py (WIP)
# =======================

# Load every input with auto_read_file (automatic type detection)
h_ts_1_dflow, h_ts_2_dflow, h_ts_3_dflow, d_ts_1_dflow, d_ts_2_dflow = [
    dprep.auto_read_file(data_reference)
    for data_reference in (h_ts_1_dr, h_ts_2_dr, h_ts_3_dr, d_ts_1_dr, d_ts_2_dr)
]

# ===========================
# EOF azureml-dataprep-sdk.py (WIP)
# ===========================

In [None]:
# ts_1 and ts_2 need to be pivoted; preview ts_1 to see its current layout
h_ts_1_dflow.head(6)

In [None]:
# PIVOT data: pivot on NODE_ID, taking the MAX of MW, grouped by MYDATE and HOUR
max_summary = azureml.dataprep.api.engineapi.typedefinitions.SummaryFunction.MAX
h_ts_1_pivot_dflow = h_ts_1_dflow.pivot(['NODE_ID'], 'MW', max_summary, ['MYDATE', 'HOUR'])
h_ts_1_pivot_dflow.head(2)

In [None]:
# Use derive_column_by_example to merge MYDATE/HOUR into a single DATETIME field
builder = h_ts_1_pivot_dflow.builders.derive_column_by_example(
    source_columns=['MYDATE', 'HOUR'],
    new_column_name='DATETIME',
)
# (source values, expected DATETIME string) pairs the builder learns from
builder_examples = [
    ({'MYDATE': '1/1/2012', 'HOUR': 1}, '01/01/2012 01:00'),
    ({'MYDATE': '10/10/2012', 'HOUR': 15}, '10/10/2012 15:00'),
    ({'MYDATE': '1/17/2012', 'HOUR': 12}, '01/17/2012 12:00'),
]
for example_source, example_result in builder_examples:
    builder.add_example(source_data=example_source, example_value=example_result)
builder.preview(skip=3000, count=20)

In [None]:
# Build the dataflow that includes the derived DATETIME column and preview it
h_ts_1_pivot_dt_dflow = builder.to_dataflow()
h_ts_1_pivot_dt_dflow.head(6)

In [None]:
# Inspect the column types after deriving DATETIME
h_ts_1_pivot_dt_dflow.dtypes

In [None]:
# Same derivation without the builder: pass the examples directly, then drop the
# two source columns now that DATETIME replaces them
inline_examples = [
    ({'MYDATE': '1/1/2012', 'HOUR': '1'}, '01/01/2012 01:00'),
    ({'MYDATE': '10/10/2012', 'HOUR': '15'}, '10/10/2012 15:00'),
    ({'MYDATE': '1/17/2012', 'HOUR': '12'}, '01/17/2012 12:00'),
]
h_ts_1_pivot_dt_dflow = h_ts_1_pivot_dflow.derive_column_by_example(
    source_columns=['MYDATE', 'HOUR'],
    new_column_name='DATETIME',
    example_data=inline_examples,
)
h_ts_1_pivot_dt_dflow = h_ts_1_pivot_dt_dflow.drop_columns(['MYDATE', 'HOUR'])
h_ts_1_pivot_dt_dflow.head(4)

In [None]:
# Learn type conversion candidates for the pivoted dataflow and display them
builder = h_ts_1_pivot_dt_dflow.builders.set_column_types()
builder.learn()
builder.conversion_candidates

In [None]:
# Convert DATETIME to a DATE field. The derived strings are month-first
# (e.g. '01/17/2012 12:00' -- 17 cannot be a month), so the parse format must be
# %m/%d/%Y; the previous %d/%m/%Y format did not match the derived values.
builder.conversion_candidates['DATETIME'] = (dprep.FieldType.DATE, ['%m/%d/%Y %H:%M'])
h_ts_1_pivot_dt_dflow = builder.to_dataflow()
h_ts_1_pivot_dt_dflow.head(4)

In [None]:
# Confirm the column types after the DATETIME conversion
h_ts_1_pivot_dt_dflow.dtypes

In [None]:
# Display the data profile of the pivoted, typed dataflow
h_ts_1_pivot_dt_dflow.get_profile()

In [None]:
# =======================
# azureml-dataprep-sdk.py (WIP)
# =======================

# Load every input with auto_read_file (automatic type detection)
h_ts_1_dflow, h_ts_2_dflow, h_ts_3_dflow, d_ts_1_dflow, d_ts_2_dflow = [
    dprep.auto_read_file(data_reference)
    for data_reference in (h_ts_1_dr, h_ts_2_dr, h_ts_3_dr, d_ts_1_dr, d_ts_2_dr)
]

# Pivot data: ts_1 and ts_2 are pivoted identically
# (NODE_ID pivoted, MAX of MW, grouped by MYDATE and HOUR)
max_summary = azureml.dataprep.api.engineapi.typedefinitions.SummaryFunction.MAX
h_ts_1_pivot_dflow, h_ts_2_pivot_dflow = [
    series_dflow.pivot(['NODE_ID'], 'MW', max_summary, ['MYDATE', 'HOUR'])
    for series_dflow in (h_ts_1_dflow, h_ts_2_dflow)
]

# merge DATE and HOUR and update datatype for DATETIME column
def ts_merge_date_hour_to_datetime(dflow, date_column_name, hour_column_name):
    """Merge a date column and an hour column into one typed DATETIME column.

    Args:
        dflow: dataflow containing both source columns.
        date_column_name: name of the column holding month/day/year date strings.
        hour_column_name: name of the column holding the hour.

    Returns:
        A new dataflow in which the two source columns are replaced by a
        DATETIME column converted to dprep.FieldType.DATE.
    """
    # merge columns: derive 'MM/DD/YYYY HH:MM' strings by example, then drop the sources
    datetime_examples = [
        ({date_column_name: '1/1/2012', hour_column_name: '1'}, '01/01/2012 01:00'),
        ({date_column_name: '10/10/2012', hour_column_name: '15'}, '10/10/2012 15:00'),
        ({date_column_name: '1/17/2012', hour_column_name: '12'}, '01/17/2012 12:00'),
    ]
    dflow = dflow.derive_column_by_example(
        source_columns=[date_column_name, hour_column_name],
        new_column_name='DATETIME',
        example_data=datetime_examples,
    ).drop_columns([date_column_name, hour_column_name])
    # update data type
    builder = dflow.builders.set_column_types()
    builder.learn()
    # The derived values are month-first ('01/17/2012' cannot be day/month), so the
    # parse format must be %m/%d/%Y; %d/%m/%Y did not match the derived values.
    builder.conversion_candidates['DATETIME'] = (dprep.FieldType.DATE, ['%m/%d/%Y %H:%M'])
    return builder.to_dataflow()

# generate all DATETIME columns with proper data type
# (the pivoted series use MYDATE/HOUR; h_ts_3 is not pivoted and uses DATE/HE)
h_ts_1_pivot_dt_dflow = ts_merge_date_hour_to_datetime(h_ts_1_pivot_dflow,'MYDATE','HOUR')
h_ts_2_pivot_dt_dflow = ts_merge_date_hour_to_datetime(h_ts_2_pivot_dflow,'MYDATE','HOUR')
h_ts_3_dt_dflow =       ts_merge_date_hour_to_datetime(h_ts_3_dflow,      'DATE','HE')

# ===========================
# EOF azureml-dataprep-sdk.py (WIP)
# ===========================

In [None]:
# Check the profile of the pivoted ts_1 dataflow after the DATETIME merge
h_ts_1_pivot_dt_dflow.get_profile()

In [None]:
# Check the profile of the d_ts_1 dataflow
d_ts_1_dflow.get_profile()

In [None]:
# JOINING DATA SET h_ts_j1=h1,h2 (on DATETIME, prefixing each side's columns)
h1_h2_joined_dflow = dprep.Dataflow.join(
    h_ts_1_pivot_dt_dflow,
    h_ts_2_pivot_dt_dflow,
    join_key_pairs=[('DATETIME', 'DATETIME')],
    left_column_prefix='h1_',
    right_column_prefix='h2_',
)
# Keep a single, unprefixed join key
h_ts_j1_dflow = h1_h2_joined_dflow.drop_columns('h2_DATETIME')
h_ts_j1_dflow = h_ts_j1_dflow.rename_columns({'h1_DATETIME': 'DATETIME'})
h_ts_j1_dflow.head(5)

In [None]:
# JOINING DATA SET h_ts_dflow=h_ts_j1,h3 (left side keeps its names, right side gets 'h3_')
j1_h3_joined_dflow = dprep.Dataflow.join(
    h_ts_j1_dflow,
    h_ts_3_dt_dflow,
    join_key_pairs=[('DATETIME', 'DATETIME')],
    left_column_prefix='',
    right_column_prefix='h3_',
)
# Drop the duplicated join key coming from the right side
h_ts_dflow = j1_h3_joined_dflow.drop_columns('h3_DATETIME')
h_ts_dflow.head(5)

In [None]:
# =======================
# azureml-dataprep-sdk.py (WIP)
# =======================

# Load every input with auto_read_file (automatic type detection)
h_ts_1_dflow, h_ts_2_dflow, h_ts_3_dflow, d_ts_1_dflow, d_ts_2_dflow = [
    dprep.auto_read_file(data_reference)
    for data_reference in (h_ts_1_dr, h_ts_2_dr, h_ts_3_dr, d_ts_1_dr, d_ts_2_dr)
]

# Pivot data: ts_1 and ts_2 are pivoted identically
# (NODE_ID pivoted, MAX of MW, grouped by MYDATE and HOUR)
max_summary = azureml.dataprep.api.engineapi.typedefinitions.SummaryFunction.MAX
h_ts_1_pivot_dflow, h_ts_2_pivot_dflow = [
    series_dflow.pivot(['NODE_ID'], 'MW', max_summary, ['MYDATE', 'HOUR'])
    for series_dflow in (h_ts_1_dflow, h_ts_2_dflow)
]

# merge DATE and HOUR and update datatype for DATETIME column
def ts_merge_date_hour_to_datetime(dflow, date_column_name, hour_column_name):
    """Merge a date column and an hour column into one typed DATETIME column.

    Args:
        dflow: dataflow containing both source columns.
        date_column_name: name of the column holding month/day/year date strings.
        hour_column_name: name of the column holding the hour.

    Returns:
        A new dataflow in which the two source columns are replaced by a
        DATETIME column converted to dprep.FieldType.DATE.
    """
    # merge columns: derive 'MM/DD/YYYY HH:MM' strings by example, then drop the sources
    datetime_examples = [
        ({date_column_name: '1/1/2012', hour_column_name: '1'}, '01/01/2012 01:00'),
        ({date_column_name: '10/10/2012', hour_column_name: '15'}, '10/10/2012 15:00'),
        ({date_column_name: '1/17/2012', hour_column_name: '12'}, '01/17/2012 12:00'),
    ]
    dflow = dflow.derive_column_by_example(
        source_columns=[date_column_name, hour_column_name],
        new_column_name='DATETIME',
        example_data=datetime_examples,
    ).drop_columns([date_column_name, hour_column_name])
    # update data type
    builder = dflow.builders.set_column_types()
    builder.learn()
    # The derived values are month-first ('01/17/2012' cannot be day/month), so the
    # parse format must be %m/%d/%Y; %d/%m/%Y did not match the derived values.
    builder.conversion_candidates['DATETIME'] = (dprep.FieldType.DATE, ['%m/%d/%Y %H:%M'])
    return builder.to_dataflow()

# generate all DATETIME columns with proper data type
# (the pivoted series use MYDATE/HOUR; h_ts_3 is not pivoted and uses DATE/HE)
h_ts_1_pivot_dt_dflow = ts_merge_date_hour_to_datetime(h_ts_1_pivot_dflow, 'MYDATE', 'HOUR')
h_ts_2_pivot_dt_dflow = ts_merge_date_hour_to_datetime(h_ts_2_pivot_dflow, 'MYDATE', 'HOUR')
h_ts_3_dt_dflow = ts_merge_date_hour_to_datetime(h_ts_3_dflow, 'DATE', 'HE')

# JOINING DATA SET h_ts_dflow=join(h1,h2,h3)
# Step 1: h1 with h2 on DATETIME, keeping a single unprefixed DATETIME key
h1_h2_dflow = dprep.Dataflow.join(
    h_ts_1_pivot_dt_dflow,
    h_ts_2_pivot_dt_dflow,
    join_key_pairs=[('DATETIME', 'DATETIME')],
    left_column_prefix='h1_',
    right_column_prefix='h2_',
)
h1_h2_dflow = h1_h2_dflow.drop_columns(['h2_DATETIME']).rename_columns({'h1_DATETIME':'DATETIME'})
# Step 2: the result with h3 on DATETIME, dropping h3's copy of the key
h_ts_dflow = dprep.Dataflow.join(
    h1_h2_dflow,
    h_ts_3_dt_dflow,
    join_key_pairs=[('DATETIME', 'DATETIME')],
    left_column_prefix='',
    right_column_prefix='h3_',
)
h_ts_dflow = h_ts_dflow.drop_columns('h3_DATETIME')

# JOINING DATA SET d_ts_dflow=join(d1,d2) on RDATE
d1_d2_dflow = dprep.Dataflow.join(
    d_ts_1_dflow,
    d_ts_2_dflow,
    join_key_pairs=[('RDATE', 'RDATE')],
    left_column_prefix='',
    right_column_prefix='r_',
)
d_ts_dflow = d1_d2_dflow.drop_columns(['r_RDATE']).rename_columns({'r_X2':'X2'})

# ===========================
# EOF azureml-dataprep-sdk.py (WIP)
# ===========================

In [None]:
# Check the profile of the joined h1/h2/h3 dataflow
h_ts_dflow.get_profile()

In [None]:
# TODO: keep the DATE column in the H (h_ts) dataflow so that it can be summarized
#       by DATE rather than by DATETIME.

In [None]:
# summarize h_ts_dflow to daily
def generate_summary_column(column_name, column_suffix, summary_function):
    """Describe one aggregation of `column_name`.

    The aggregated column is named '<column_name>_<column_suffix>' and is
    computed with `summary_function`.
    """
    output_column_name = '_'.join((column_name, column_suffix))
    return dprep.SummaryColumnsValue(
        column_id=column_name,
        summary_column_name=output_column_name,
        summary_function=summary_function,
    )

def generate_summary_columns(dflow):
    """Build MAX/MIN/MEAN/MEDIAN summary definitions for every column of `dflow`.

    The DATETIME column is skipped. The columns are read from the profile of the
    `dflow` argument; the original read the global `h_ts_dflow` and ignored the
    argument.

    Args:
        dflow: dataflow whose profiled columns are to be summarized.

    Returns:
        A list of summary column definitions, four per non-DATETIME column, in
        MAX, MIN, MEAN, MEDIAN order.
    """
    summary_functions = (
        ('MAX', dprep.SummaryFunction.MAX),
        ('MIN', dprep.SummaryFunction.MIN),
        ('MEAN', dprep.SummaryFunction.MEAN),
        ('MEDIAN', dprep.SummaryFunction.MEDIAN),
    )
    summary_columns = []
    for key in dflow.get_profile().columns.keys():
        if key == 'DATETIME':
            continue
        for suffix, summary_function in summary_functions:
            summary_columns.append(generate_summary_column(key, suffix, summary_function))
    return summary_columns

# Aggregate every non-DATETIME column of h_ts_dflow, grouped by DATETIME.
# NOTE(review): the grouping key is DATETIME (date + hour), not a date, so this
# does not yet produce one row per day -- see the TODO in the previous cell.
h_ts_summarized_dflow = h_ts_dflow.summarize(
    summary_columns=generate_summary_columns(h_ts_dflow),
    group_by_columns=['DATETIME'])

# Preview the summarized result
h_ts_summarized_dflow.head(4)

## FINAL azureml-dataprep-sdk.py Script

In [None]:
# =======================
# azureml-dataprep-sdk.py
# =======================

# Load every input with auto_read_file (automatic type detection)
h_ts_1_dflow, h_ts_2_dflow, h_ts_3_dflow, d_ts_1_dflow, d_ts_2_dflow = [
    dprep.auto_read_file(data_reference)
    for data_reference in (h_ts_1_dr, h_ts_2_dr, h_ts_3_dr, d_ts_1_dr, d_ts_2_dr)
]

# Pivot data: ts_1 and ts_2 are pivoted identically
# (NODE_ID pivoted, MAX of MW, grouped by MYDATE and HOUR)
max_summary = azureml.dataprep.api.engineapi.typedefinitions.SummaryFunction.MAX
h_ts_1_pivot_dflow, h_ts_2_pivot_dflow = [
    series_dflow.pivot(['NODE_ID'], 'MW', max_summary, ['MYDATE', 'HOUR'])
    for series_dflow in (h_ts_1_dflow, h_ts_2_dflow)
]

# merge DATE and HOUR and update datatype for DATETIME column
def ts_merge_date_hour_to_datetime(dflow, date_column_name, hour_column_name):
    """Merge a date column and an hour column into one typed DATETIME column.

    Args:
        dflow: dataflow containing both source columns.
        date_column_name: name of the column holding month/day/year date strings.
        hour_column_name: name of the column holding the hour.

    Returns:
        A new dataflow in which the two source columns are replaced by a
        DATETIME column converted to dprep.FieldType.DATE.
    """
    # merge columns: derive 'MM/DD/YYYY HH:MM' strings by example, then drop the sources
    datetime_examples = [
        ({date_column_name: '1/1/2012', hour_column_name: '1'}, '01/01/2012 01:00'),
        ({date_column_name: '10/10/2012', hour_column_name: '15'}, '10/10/2012 15:00'),
        ({date_column_name: '1/17/2012', hour_column_name: '12'}, '01/17/2012 12:00'),
    ]
    dflow = dflow.derive_column_by_example(
        source_columns=[date_column_name, hour_column_name],
        new_column_name='DATETIME',
        example_data=datetime_examples,
    ).drop_columns([date_column_name, hour_column_name])
    # update data type
    builder = dflow.builders.set_column_types()
    builder.learn()
    # The derived values are month-first ('01/17/2012' cannot be day/month), so the
    # parse format must be %m/%d/%Y; %d/%m/%Y did not match the derived values.
    builder.conversion_candidates['DATETIME'] = (dprep.FieldType.DATE, ['%m/%d/%Y %H:%M'])
    return builder.to_dataflow()

# generate all DATETIME columns with proper data type
# (the pivoted series use MYDATE/HOUR; h_ts_3 is not pivoted and uses DATE/HE)
h_ts_1_pivot_dt_dflow = ts_merge_date_hour_to_datetime(h_ts_1_pivot_dflow, 'MYDATE', 'HOUR')
h_ts_2_pivot_dt_dflow = ts_merge_date_hour_to_datetime(h_ts_2_pivot_dflow, 'MYDATE', 'HOUR')
h_ts_3_dt_dflow = ts_merge_date_hour_to_datetime(h_ts_3_dflow, 'DATE', 'HE')

# JOINING DATA SET h_ts_dflow=join(h1,h2,h3)
# Step 1: h1 with h2 on DATETIME, keeping a single unprefixed DATETIME key
h1_h2_dflow = dprep.Dataflow.join(
    h_ts_1_pivot_dt_dflow,
    h_ts_2_pivot_dt_dflow,
    join_key_pairs=[('DATETIME', 'DATETIME')],
    left_column_prefix='h1_',
    right_column_prefix='h2_',
)
h1_h2_dflow = h1_h2_dflow.drop_columns(['h2_DATETIME']).rename_columns({'h1_DATETIME':'DATETIME'})
# Step 2: the result with h3 on DATETIME, dropping h3's copy of the key
h_ts_dflow = dprep.Dataflow.join(
    h1_h2_dflow,
    h_ts_3_dt_dflow,
    join_key_pairs=[('DATETIME', 'DATETIME')],
    left_column_prefix='',
    right_column_prefix='h3_',
)
h_ts_dflow = h_ts_dflow.drop_columns(['h3_DATETIME'])

# JOINING DATA SET d_ts_dflow=join(d1,d2) on RDATE
d1_d2_dflow = dprep.Dataflow.join(
    d_ts_1_dflow,
    d_ts_2_dflow,
    join_key_pairs=[('RDATE', 'RDATE')],
    left_column_prefix='',
    right_column_prefix='r_',
)
d_ts_dflow = d1_d2_dflow.drop_columns(['r_RDATE']).rename_columns({'r_X2':'X2'})

# helper: describe a single summary (aggregated) column
def generate_summary_column(column_name, column_suffix, summary_function):
    """Describe one aggregation of `column_name`.

    The aggregated column is named '<column_name>_<column_suffix>' and is
    computed with `summary_function`.
    """
    output_column_name = '_'.join((column_name, column_suffix))
    return dprep.SummaryColumnsValue(
        column_id=column_name,
        summary_column_name=output_column_name,
        summary_function=summary_function,
    )

# helper: generate summary columns for a few functions for each column that's not DATETIME
def generate_summary_columns(dflow):
    """Build MAX/MIN/MEAN/MEDIAN summary definitions for every column of `dflow`.

    The DATETIME column is skipped. The columns are read from the profile of the
    `dflow` argument; the original read the global `h_ts_dflow` and ignored the
    argument.

    Args:
        dflow: dataflow whose profiled columns are to be summarized.

    Returns:
        A list of summary column definitions, four per non-DATETIME column, in
        MAX, MIN, MEAN, MEDIAN order.
    """
    summary_functions = (
        ('MAX', dprep.SummaryFunction.MAX),
        ('MIN', dprep.SummaryFunction.MIN),
        ('MEAN', dprep.SummaryFunction.MEAN),
        ('MEDIAN', dprep.SummaryFunction.MEDIAN),
    )
    summary_columns = []
    for key in dflow.get_profile().columns.keys():
        if key == 'DATETIME':
            continue
        for suffix, summary_function in summary_functions:
            summary_columns.append(generate_summary_column(key, suffix, summary_function))
    return summary_columns

# summarize h_ts_dflow: aggregate every non-DATETIME column, grouped by DATETIME
# NOTE(review): intended as a daily summary, but the grouping key is DATETIME.
h_summary_columns = generate_summary_columns(h_ts_dflow)
h_ts_summarized_dflow = h_ts_dflow.summarize(
    summary_columns=h_summary_columns,
    group_by_columns=['DATETIME'],
)

# join h and d series (h.DATETIME against d.RDATE), then strip the 'r_' prefix
h_d_joined_dflow = dprep.Dataflow.join(
    h_ts_summarized_dflow,
    d_ts_dflow,
    join_key_pairs=[('DATETIME', 'RDATE')],
    left_column_prefix='',
    right_column_prefix='r_',
)
training_dflow = h_d_joined_dflow.drop_columns(['r_RDATE'])
training_dflow = training_dflow.rename_columns({'r_X1':'X1','r_X2':'X2'})

# ===========================
# EOF azureml-dataprep-sdk.py
# ===========================

In [None]:
# Test output: add a write-to-CSV step targeting 'training_dflow.csv' and run it locally
training_dflow.write_to_csv(directory_path='training_dflow.csv').run_local()

In [None]:
# Check the profile of the final training dataflow
training_dflow.get_profile()

## Creating RunConfiguration

In [None]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.core.conda_dependencies import CondaDependencies

# Create a new run configuration and work on its environment settings
run_config = RunConfiguration()
step_environment = run_config.environment

# Run inside Docker, using the default CPU-based base image
step_environment.docker.enabled = True
step_environment.docker.base_image = DEFAULT_CPU_IMAGE

# Dependencies are not user-managed: a conda environment is created in the Docker image
step_environment.python.user_managed_dependencies = False

# Dependencies come from the conda file. Inline alternative kept for reference:
#   CondaDependencies.create(
#       conda_packages=['pandas'],
#       pip_packages=['azureml-sdk', 'azureml-dataprep[fuse,pandas]', 'azureml-train-automl'],
#       pin_sdk_version=False)
step_environment.python.conda_dependencies = CondaDependencies(
    conda_dependencies_file_path='data-prep-pipeline.yml')

print("== Run Configuration created")

## Creating Pipeline

In [None]:
# Best practice: keep each step's script and dependent files in their own folder and
# use that folder as the step's source_directory. Only that folder is snapshotted, so
# the snapshot stays small, and because any change in source_directory triggers a
# re-upload, this also preserves step reuse when nothing in the folder changed.
source_directory_dataprep = 'src/azureml-dataprep-sdk'

# inputs: CSV files on the workspace blob datastore
def _blob_input(reference_name, blob_path):
    """Return a DataReference named `reference_name` for `blob_path` on blob_store."""
    return DataReference(
        datastore=blob_store,
        data_reference_name=reference_name,
        path_on_datastore=blob_path,
    )

h_ts_1_dr = _blob_input("h_ts_1", "datasets/time-series/S_ACTUALS.csv")
h_ts_2_dr = _blob_input("h_ts_2", "datasets/time-series/W_ACTUALS.csv")
h_ts_3_dr = _blob_input("h_ts_3", "datasets/time-series/C_LOAD.csv")
d_ts_1_dr = _blob_input("d_ts_1", "datasets/time-series/X1.csv")
d_ts_2_dr = _blob_input("d_ts_2", "datasets/time-series/X2.csv")

# output: intermediate pipeline data written to the same datastore
d_use_case_1_dataprep_sdk_pd = PipelineData("d_use_case_1_dataprep_sdk", datastore=blob_store)

# Step: run the data prep script with every input and the output passed as arguments
step_inputs = [h_ts_1_dr, h_ts_2_dr, h_ts_3_dr, d_ts_1_dr, d_ts_2_dr]
step_arguments = [
    "--h_ts_1", h_ts_1_dr,
    "--h_ts_2", h_ts_2_dr,
    "--h_ts_3", h_ts_3_dr,
    "--d_ts_1", d_ts_1_dr,
    "--d_ts_2", d_ts_2_dr,
    "--output", d_use_case_1_dataprep_sdk_pd,
]
use_case_1_dataprep_sdk_step = PythonScriptStep(
    script_name="azureml-dataprep-sdk.py",
    arguments=step_arguments,
    inputs=step_inputs,
    outputs=[d_use_case_1_dataprep_sdk_pd],
    compute_target=aml_compute,
    source_directory=source_directory_dataprep,
    runconfig=run_config,
)

print("== PythonScriptStep use_case_1_dataprep_step created")

### Build the pipeline and submit an Experiment run

In [None]:
# Assemble the pipeline from its single data prep step
pipeline_steps = [use_case_1_dataprep_sdk_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("== Pipeline is built")

In [None]:
# Submit the pipeline as a run of the 'use-case-1-dataprep-sdk' experiment
dataprep_experiment = Experiment(ws, 'use-case-1-dataprep-sdk')
pipeline_run = dataprep_experiment.submit(pipeline)
print("== Pipeline is submitted for execution")

In [None]:
# Show the run details widget for the submitted pipeline run
RunDetails(pipeline_run).show()

### Wait for pipeline run to complete

In [None]:
# Wait until the pipeline run completes, showing its output while waiting
pipeline_run.wait_for_completion(show_output=True)

### See Outputs

See where outputs of each pipeline step are located on your datastore.

***Wait for pipeline run to complete, to make sure all the outputs are ready***

In [None]:
# For every step of the run, print where each of its outputs lives on the datastore
for pipeline_step in pipeline_run.get_steps():
    print(f"== Outputs of step {pipeline_step.name}")

    # get_outputs() returns a dictionary of StepRunOutputs keyed by output name
    for output_name, step_output in pipeline_step.get_outputs().items():
        port_reference = step_output.get_port_data_reference()  # output port data reference
        print(f"\tname: {output_name}")
        print(f"\tdatastore: {port_reference.datastore_name}")
        print(f"\tpath on datastore: {port_reference.path_on_datastore}")

In [None]:
# REGISTER a new version of the final output as a Dataset

from azureml.core import Dataset, Datastore
from azureml.data.datapath import DataPath

# Find the pipeline output named 'd_use_case_1_dataprep_sdk' and register it
for step in pipeline_run.get_steps():
    for name, output in step.get_outputs().items():
        if name != 'd_use_case_1_dataprep_sdk':
            continue
        # Build a Tabular Dataset from the delimited files at the output's datastore path
        output_reference = output.get_port_data_reference()
        output_path = DataPath(blob_store, output_reference.path_on_datastore)
        datastore_path = [output_path]
        ds = Dataset.Tabular.from_delimited_files(datastore_path)
        # Register under a fixed name, creating a new version on each registration
        dataset_name = 'd_use_case_1_dataprep_sdk'
        ds.register(ws, name=dataset_name, create_new_version=True)
        print("== Registered new version of dataset: " + dataset_name)