#### Run az login to get credentials

In [None]:
!az login

In [22]:
from azureml.core.authentication import AzureCliAuthentication
auth = AzureCliAuthentication()

#### Get subscription id

In [None]:
import subprocess
import json
subscriptions = json.loads(subprocess.check_output('az account list', shell=True).decode('utf-8'))
subscription_id = str(subscriptions[0]['id'])
subscription_id

#### Configuration information

In [23]:
conf = {
    "subscription_id": subscription_id,
    "resource_group": "rg-demo",
    "workspace_name": "ws-demo",
    "compute_name": "compute-demo",
    "location": "eastus",
    "vm_size": "STANDARD_D2_V2",
    "pip_packages":['pandas', 'scikit-learn', 'azureml-sdk', 'azureml-dataset-runtime[fuse,pandas]'],
    "experiment_name": "DemoTraining",
    "min_nodes":1,
    "max_nodes":2,
    "pipeline_name": "BostonRegression",
    "pipeline_description": "Linear Regression for Boston Dataset"
}

#### Create workspace & resource group

In [24]:
from azureml.core import Workspace

try:
    ws = Workspace.get(
        auth=auth, 
        name=conf['workspace_name'],
        subscription_id=conf['subscription_id'],
        resource_group=conf['resource_group'])
except Exception as e:
    print(f"Creating workspace {conf['workspace_name']}")
    ws = Workspace.create(
        auth=auth,
        name=conf['workspace_name'],
        subscription_id=conf['subscription_id'],
        resource_group=conf['resource_group'],
        create_resource_group=True,
        location='eastus')

#### Create Compute target

In [25]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget, ComputeInstance
from azureml.core.compute_target import ComputeTargetException

try:
    compute_target = ComputeInstance(workspace=ws, name=conf['compute_name'])
    print('Found existing instance, use it.')
except ComputeTargetException:
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size=conf['vm_size'],
        min_nodes=conf['min_nodes'], 
        max_nodes=conf['max_nodes'])
    compute_target = ComputeTarget.create(ws, conf['compute_name'], provisioning_config)
    compute_target.wait_for_completion(show_output=True)

Found existing instance, use it.


#### Load dataset

In [26]:
from sklearn.datasets import load_boston
import pandas as pd

boston = load_boston()
df_input = pd.DataFrame(columns=[f"feat_{i}" for i in range(boston['data'].shape[1])] ,data=boston['data'])
df_target = pd.DataFrame(columns=["label"] ,data=boston['target'])
print(df_input, df_target)

      feat_0  feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  \
0    0.00632    18.0    2.31     0.0   0.538   6.575    65.2  4.0900     1.0   
1    0.02731     0.0    7.07     0.0   0.469   6.421    78.9  4.9671     2.0   
2    0.02729     0.0    7.07     0.0   0.469   7.185    61.1  4.9671     2.0   
3    0.03237     0.0    2.18     0.0   0.458   6.998    45.8  6.0622     3.0   
4    0.06905     0.0    2.18     0.0   0.458   7.147    54.2  6.0622     3.0   
..       ...     ...     ...     ...     ...     ...     ...     ...     ...   
501  0.06263     0.0   11.93     0.0   0.573   6.593    69.1  2.4786     1.0   
502  0.04527     0.0   11.93     0.0   0.573   6.120    76.7  2.2875     1.0   
503  0.06076     0.0   11.93     0.0   0.573   6.976    91.0  2.1675     1.0   
504  0.10959     0.0   11.93     0.0   0.573   6.794    89.3  2.3889     1.0   
505  0.04741     0.0   11.93     0.0   0.573   6.030    80.8  2.5050     1.0   

     feat_9  feat_10  feat_11  feat_12 


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

#### Create datastore

In [27]:
df_input.to_parquet('./input.parquet')
df_target.to_parquet('./target.parquet')

default_datastore = ws.get_default_datastore()

default_datastore.upload_files(files = ['./input.parquet'],
                       target_path = 'train-dataset/tabular/',
                       overwrite = True,
                       show_progress = True)

default_datastore.upload_files(files = ['./target.parquet'],
                       target_path = 'train-dataset/tabular/',
                       overwrite = True,
                       show_progress = True)

Uploading an estimated of 1 files
Uploading ./input.parquet
Uploaded ./input.parquet, 1 files out of an estimated total of 1
Uploaded 1 files
Uploading an estimated of 1 files
Uploading ./target.parquet
Uploaded ./target.parquet, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_b8e5657227c44c9daa81649bf3113dd9

In [28]:
from azureml.exceptions import UserErrorException
from azureml.core import Datastore

blob_datastore_name=default_datastore.name
blob_datastore_container_name=default_datastore.container_name
blob_datastore_account_name=default_datastore.account_name

# blob_datastore_account_name
# blob_datastore_container_name
# blob_account_key
# blob_datastore_name (given by the user)

try:
    blob_datastore = Datastore.get(ws, blob_datastore_name)
    print("Found Blob Datastore with name: %s" % blob_datastore_name)
except UserErrorException:
    blob_datastore = Datastore.register_azure_blob_container(
       workspace=ws,
       datastore_name=blob_datastore_name,
       account_name=blob_datastore_account_name, # Storage account name
       container_name=blob_datastore_container_name, # Name of Azure blob container
       account_key=blob_account_key) # Storage account key
    print("Registered blob datastore with name: %s" % blob_datastore_name)

Found Blob Datastore with name: workspaceblobstore


#### Create Dataset

In [29]:
from azureml.core import Dataset
df_input = Dataset.Tabular.from_parquet_files(
    validate=False, 
    path = [(default_datastore, 'train-dataset/tabular/input.parquet')])

df_target = Dataset.Tabular.from_parquet_files(
    validate=False, 
    path = [(default_datastore, 'train-dataset/tabular/target.parquet')])

df_input.register(workspace=ws, name='df_input', create_new_version=True)
df_target.register(workspace=ws, name='df_target', create_new_version=True)

#### Create Environment

In [30]:
from azureml.core import Environment
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

pip_packages = conf['pip_packages']
env = Environment(name='custom_env')
env_dep = CondaDependencies.create(
    pip_packages=pip_packages
)
env.python.conda_dependencies = env_dep
env.register(ws)
run_config = RunConfiguration()
run_config.target = compute_target
run_config.environment = env

#### Pipeline

Step 1 of Pipeline

In [31]:
%%writefile preprocess.py
import os
from argparse import ArgumentParser
from azureml.core import Run

parser = ArgumentParser()
parser.add_argument('--input', type=str, dest='input')
parser.add_argument('--target', type=str, dest='target')
parser.add_argument('--output', type=str, dest='output')
parser.add_argument('--output_csv', type=str, dest='output_csv')
args = parser.parse_args()

run = Run.get_context()
input_data = run.input_datasets[args.input]
df = input_data.to_pandas_dataframe()
normalized_df=(df-df.min())/(df.max()-df.min())

# Log number of features
run.log("Number of features", len(df.columns))

# Output final dataframe for training step
os.makedirs(args.output, exist_ok=True)

# Create the path
path = os.path.join(args.output, args.output_csv)

# Write the data preparation output as csv file
normalized_df.to_csv(path, index=False)

run.complete()

Overwriting preprocess.py


Step2 of Pipeline

In [32]:
%%writefile train.py
import os
from argparse import ArgumentParser
from azureml.core import Run
from sklearn.linear_model import LinearRegression
from sklearn.metrics import median_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
import joblib

parser = ArgumentParser()
parser.add_argument('--input', type=str, dest='input')
parser.add_argument('--input_csv', type=str, dest='input_csv')
parser.add_argument('--target', type=str, dest='target')
parser.add_argument('--output', type=str, dest='output')
args = parser.parse_args()

run = Run.get_context()
path = os.path.join(args.input, args.input_csv)
df_input = pd.read_csv(path)
target_data = run.input_datasets[args.target]
df_target = target_data.to_pandas_dataframe()

X_train, X_test, y_train, y_test = train_test_split(df_input, df_target, random_state=1)
regr = LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

# Register model to workspace
os.makedirs(args.output, exist_ok=True)
path = os.path.join(args.output, 'regression_model.pkl')
joblib.dump(value=regr, filename=path)
run.upload_file(f"{args.output}/regression_model.pkl", f"{args.output}/regression_model.pkl")
model = run.register_model(model_path=f'{args.output}/regression_model.pkl',
                           model_name='regression_model_boston',
                           tags={'source': 'SDK Run', 'algorithm': 'Linear Regression'})

run.log("R2 Score", r2_score(y_test, y_pred))
run.log("MAE", median_absolute_error(y_test, y_pred))

run.complete()

Overwriting train.py


In [33]:
from azureml.core import Dataset
# Get a dataset for the initial data
input_ds = Dataset.get_by_name(ws, 'df_input')
target_ds = Dataset.get_by_name(ws, 'df_target')

In [34]:
from azureml.pipeline.core import PipelineData
data_store = ws.get_default_datastore()
prοcessed_data = PipelineData('processed_data', datastore=data_store)

In [35]:
import os 
from azureml.pipeline.steps import PythonScriptStep
step1 = PythonScriptStep(
    name = 'process data',
    source_directory = os.getcwd(),
    script_name = 'preprocess.py',
    compute_target = conf['compute_name'],
    allow_reuse=False,
    arguments = [
        '--input', 'df_input',
        '--target', 'df_target',
        '--output', prοcessed_data,
        '--output_csv', 'prοcessed_data.csv',
    ],
    inputs=[
        input_ds.as_named_input('df_input'),
        target_ds.as_named_input('df_target'),
    ],
    outputs=[prοcessed_data],
    runconfig=run_config)

In [36]:
import os 
from azureml.pipeline.steps import PythonScriptStep
step2 = PythonScriptStep(
    name = 'train data',
    source_directory = os.getcwd(),
    script_name = 'train.py',
    compute_target = conf['compute_name'],
    allow_reuse=False,
    arguments = [
        '--input', prοcessed_data,
        '--input_csv', 'prοcessed_data.csv',
        '--target', 'df_target',
        '--output', 'models',
    ],
    inputs=[
        prοcessed_data,
        target_ds.as_named_input('df_target'),
    ],
    runconfig=run_config)

In [37]:
from azureml.pipeline.core import Pipeline
pipeline = Pipeline(workspace=ws, steps=[step1, step2])

In [38]:
from azureml.core import Experiment
experiment = Experiment(ws, conf['experiment_name'])
experiment_submission = experiment.submit(pipeline, continue_on_step_failure=False)
experiment_submission.wait_for_completion(show_output=True)

Created step process data [502fb208][f614bfae-eb91-4559-a647-cdcc982c08c0], (This step will run and generate new outputs)
Created step train data [79a3cc0c][7c9889ca-74ad-4e92-b42f-878e52acb176], (This step will run and generate new outputs)
Submitted PipelineRun 1a36e2d2-710a-4cb4-898a-4a2c868debf2
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/1a36e2d2-710a-4cb4-898a-4a2c868debf2?wsid=/subscriptions/8f84dc93-7554-4327-ad30-d41ea51a66c5/resourcegroups/rg-demo/workspaces/ws-demo&tid=0f664476-a5c7-4423-ae3e-0b4b7e0855de
PipelineRunId: 1a36e2d2-710a-4cb4-898a-4a2c868debf2
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/1a36e2d2-710a-4cb4-898a-4a2c868debf2?wsid=/subscriptions/8f84dc93-7554-4327-ad30-d41ea51a66c5/resourcegroups/rg-demo/workspaces/ws-demo&tid=0f664476-a5c7-4423-ae3e-0b4b7e0855de
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: b0e4e3e3-581b-483a-a7bb-81d3ae322e20
Link to Azure Machine Learning Portal: https://ml.azur




StepRunId: e8c670b9-9355-4bcd-8240-0237eda0c151
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/e8c670b9-9355-4bcd-8240-0237eda0c151?wsid=/subscriptions/8f84dc93-7554-4327-ad30-d41ea51a66c5/resourcegroups/rg-demo/workspaces/ws-demo&tid=0f664476-a5c7-4423-ae3e-0b4b7e0855de
StepRun( train data ) Status: Running

StepRun(train data) Execution Summary
StepRun( train data ) Status: Finished

This run might be using a new job runtime with improved performance and error reporting. The logs from your script are in user_logs/std_log.txt. Please let us know if you run into any issues, and if you would like to opt-out, please add the environment variable AZUREML_COMPUTE_USE_COMMON_RUNTIME to the environment variables section of the job and set its value to the string "false"





PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '1a36e2d2-710a-4cb4-898a-4a2c868debf2', 'status': 'Completed', 'startTimeUtc': '2022-01-05T11:54:54.344604Z', 'endTimeUtc': '2022-01-05T11:56:13.864468Z', 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}', 'azureml.continue_on_step_failure': 'False', 'azureml.pipelineComponent': 'pipelinerun'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://wsdemostoragefb27534f3f4.blob.core.windows.net/azureml/ExperimentRun/dcid.1a36e2d2-710a-4cb4-898a-4a2c868debf2/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b&sig=Z%2B7W3j6j7YahRJi7eHi2tGY8nIL2lApzsxUu3YVr940%3D&skoid=1435b405-43f9-4dd6-9218-82e53ca77f43&sktid=0f664476-a5c7-4423-ae3e-0b4b7e0855de&skt=2022-01-05T08%3A49%3A24Z&ske=2022-01-06T16%3A59%3A24Z&sks=b&skv=2019-07-07&st=2022-01-05T11%3A46%3A17Z&se=2022-01-05T19%3A56%3A17Z&sp=r', 'logs/azureml/stde

'Finished'

#### Publish a pipeline

In [39]:
published_pipeline = pipeline.publish(
    name=conf['pipeline_name'],
    description=conf['pipeline_description']
)
published_pipeline.endpoint

'https://eastus.api.azureml.ms/pipelines/v1.0/subscriptions/8f84dc93-7554-4327-ad30-d41ea51a66c5/resourceGroups/rg-demo/providers/Microsoft.MachineLearningServices/workspaces/ws-demo/PipelineRuns/PipelineSubmit/db00a168-b18c-42b4-b625-72579fc25e3e'

In [40]:
import requests
import json

response = requests.post(published_pipeline.endpoint,
                         json={"ExperimentName": conf["experiment_name"]},
                         headers=auth.get_authentication_header()
                         )
response.json()

{'Description': None,
 'Status': {'StatusCode': 0,
  'StatusDetail': None,
  'CreationTime': '2022-01-05T11:56:24.3744922Z',
  'EndTime': None},
 'GraphId': 'f3937770-af99-4985-865e-83bb7b56e297',
 'IsSubmitted': False,
 'HasErrors': False,
 'UploadState': 0,
 'ParameterAssignments': {},
 'DataPathAssignments': {},
 'DataSetDefinitionValueAssignments': {},
 'RunHistoryExperimentName': 'DemoTraining',
 'DisplayName': None,
 'PipelineRunId': 'a6075cc1-7236-4c0e-a3d2-a3c578cd4ba5',
 'PipelineId': 'db00a168-b18c-42b4-b625-72579fc25e3e',
 'PipelineEndpointId': None,
 'RunSource': 'Unavailable',
 'RunType': 0,
 'TotalRunSteps': 2,
 'ScheduleId': None,
 'RunUrl': 'https://ml.azure.com/experiments/DemoTraining/runs/a6075cc1-7236-4c0e-a3d2-a3c578cd4ba5?tid=0f664476-a5c7-4423-ae3e-0b4b7e0855de&wsid=/subscriptions/8f84dc93-7554-4327-ad30-d41ea51a66c5/resourcegroups/rg-demo/workspaces/ws-demo',
 'tags': {},
 'StepTags': {},
 'Properties': {},
 'StepProperties': {},
 'CreatedBy': {'UserObjectId': '