In [1]:
import azureml.core
from azureml.core import Workspace

# Connect to the workspace described by the saved config file
ws = Workspace.from_config()

# Report which SDK version and workspace this notebook is running against
sdk_version = azureml.core.VERSION
print('Ready to use Azure ML {} to work with {}'.format(sdk_version, ws.name))

Ready to use Azure ML 1.43.0 to work with Ali_workspace_student


## Prepare data


In [2]:
from azureml.core import Dataset
from azureml.data.datapath import DataPath

default_ds = ws.get_default_datastore()

# Upload and register the data only when no dataset with this name is registered yet
if 'diabetes dataset' in ws.datasets:
    print('Dataset already registered.')
else:
    # Copy the local 'data' folder into the default datastore
    Dataset.File.upload_directory(src_dir='data',
                                  target=DataPath(default_ds, 'diabetes-data/'))

    # Create a tabular dataset from the CSV files on the datastore (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))

    # Register the tabular dataset; a failure is printed and the notebook carries on
    try:
        tab_data_set = tab_data_set.register(workspace=ws,
                                             name='diabetes dataset',
                                             description='diabetes data',
                                             tags={'format': 'CSV'},
                                             create_new_version=True)
        print('Dataset registered.')
    except Exception as err:
        print(err)

Dataset already registered.


In [3]:
# Print one entry per dataset registered in the workspace
for dataset_name in ws.datasets:
    print(dataset_name)

diabetes file dataset
diabetes dataset


## Create scripts for pipeline steps


In [4]:
import os

# Folder that holds the files for the pipeline steps
experiment_folder = 'diabetes_pipeline'

# exist_ok=True makes this cell safe to re-run
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

diabetes_pipeline


In [5]:
%%writefile $experiment_folder/prep_diabetes.py
# Import libraries
import os
import argparse
import pandas as pd
from azureml.core import Run
from sklearn.preprocessing import MinMaxScaler

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')
args = parser.parse_args()
save_folder = args.prepped_data

# Get the experiment run context
run = Run.get_context()

# load the data (passed as an input dataset)
print("Loading Data...")
diabetes = run.input_datasets['raw_data'].to_pandas_dataframe()

# Log raw row count
row_count = (len(diabetes))
run.log('raw_rows', row_count)

# remove nulls
diabetes = diabetes.dropna()

# Normalize the numeric columns
scaler = MinMaxScaler()
num_cols = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree']
diabetes[num_cols] = scaler.fit_transform(diabetes[num_cols])

# Log processed rows
row_count = (len(diabetes))
run.log('processed_rows', row_count)

# Save the prepped data
print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder,'data.csv')
diabetes.to_csv(save_path, index=False, header=True)

# End the run
run.complete()

Writing diabetes_pipeline/prep_diabetes.py


In [6]:
%%writefile $experiment_folder/train_diabetes.py
# Import libraries
from azureml.core import Run, Model
import argparse
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--training-data", type=str, dest='training_data', help='training data')
args = parser.parse_args()
training_data = args.training_data

# Get the experiment run context
run = Run.get_context()

# load the prepared data file in the training folder
print("Loading Data...")
file_path = os.path.join(training_data,'data.csv')
diabetes = pd.read_csv(file_path)

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train adecision tree model
print('Training a decision tree model...')
model = DecisionTreeClassifier().fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

# plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
fig = plt.figure(figsize=(6, 4))
# Plot the diagonal 50% line
plt.plot([0, 1], [0, 1], 'k--')
# Plot the FPR and TPR achieved by our model
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
run.log_image(name = "ROC", plot = fig)
plt.show()

# Save the trained model in the outputs folder
print("Saving model...")
os.makedirs('outputs', exist_ok=True)
model_file = os.path.join('outputs', 'diabetes_model.pkl')
joblib.dump(value=model, filename=model_file)

# Register the model
print('Registering model...')
Model.register(workspace=run.experiment.workspace,
               model_path = model_file,
               model_name = 'diabetes_model',
               tags={'Training context':'Pipeline'},
               properties={'AUC': np.float(auc), 'Accuracy': np.float(acc)})


run.complete()

Writing diabetes_pipeline/train_diabetes.py


## Prepare a compute environment for the pipeline steps


In [7]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "anabavib1"

try:
    # Reuse the compute target if one with this name can be loaded
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # Lookup failed (assumed: the cluster does not exist yet), so provision one
    try:
        compute_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_DS11_V2',
            max_nodes=2)
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as err:
        # NOTE(review): the failure is only printed; if create() raised,
        # pipeline_cluster is left unbound for the cells that follow.
        print(err)
    

Found existing cluster, use it.


In [8]:
%%writefile $experiment_folder/experiment_env.yml
# Conda specification for the 'experiment_env' environment.
# NOTE(review): only the Python version is pinned; the other packages resolve to
# whatever versions are current when the environment is built — confirm this is intended.
name: experiment_env
dependencies:
# Packages installed with conda
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
# Packages installed with pip
- pip:
  - azureml-defaults
  - pyarrow

Writing diabetes_pipeline/experiment_env.yml


Now that you have a Conda configuration file, you can create an environment and use it in the run configuration for the pipeline.

In [9]:
from azureml.core import Environment
from azureml.core.runconfig import RunConfiguration

# Build a Python environment for the experiment from the Conda .yml file
conda_spec_path = experiment_folder + "/experiment_env.yml"
experiment_env = Environment.from_conda_specification("experiment_env", conda_spec_path)

# Register the environment in the workspace, then fetch the registered copy
experiment_env.register(workspace=ws)
registered_env = Environment.get(ws, 'experiment_env')

# Run configuration for the pipeline: the compute created above plus the registered environment
pipeline_run_config = RunConfiguration()
pipeline_run_config.target = pipeline_cluster
pipeline_run_config.environment = registered_env

print("Run configuration created.")

Run configuration created.


## Create and run a pipeline


In [10]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep

# Get the training dataset
diabetes_ds = ws.datasets.get("diabetes dataset")

# OutputFileDatasetConfig (temporary data reference) that carries data from step 1 to step 2
prepped_data = OutputFileDatasetConfig("prepped_data")

# Script arguments for each step
prep_arguments = ['--input-data', diabetes_ds.as_named_input('raw_data'),
                  '--prepped-data', prepped_data]
train_arguments = ['--training-data', prepped_data.as_input()]

# Step 1: run the data prep script
prep_step = PythonScriptStep(
    name="Prepare Data",
    source_directory=experiment_folder,
    script_name="prep_diabetes.py",
    arguments=prep_arguments,
    compute_target=pipeline_cluster,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)

# Step 2: run the training script on the output of step 1
train_step = PythonScriptStep(
    name="Train and Register Model",
    source_directory=experiment_folder,
    script_name="train_diabetes.py",
    arguments=train_arguments,
    compute_target=pipeline_cluster,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)

print("Pipeline steps defined")

Pipeline steps defined


In [11]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Assemble the pipeline from the two steps
pipeline_steps = [prep_step, train_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit the pipeline as a run of the 'mslearn-diabetes-pipeline' experiment
experiment = Experiment(workspace=ws, name='mslearn-diabetes-pipeline')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")

# Show the run widget, then stream the logs until the run finishes
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline is built.
Created step Prepare Data [b21732f6][2bc014f2-0901-4696-8717-8a7a05bb0701], (This step will run and generate new outputs)Created step Train and Register Model [a380ff1b][ed859403-2df6-4752-a5b8-41e724209469], (This step will run and generate new outputs)

Submitted PipelineRun 28e74074-91be-47db-9db1-8be0cd9b2e83
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/28e74074-91be-47db-9db1-8be0cd9b2e83?wsid=/subscriptions/1b822ee6-a558-4a5d-8ad7-7af385162a03/resourcegroups/Ali_Student_Workspace/workspaces/Ali_workspace_student&tid=41f88ecb-ca63-404d-97dd-ab0a169fd138
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 28e74074-91be-47db-9db1-8be0cd9b2e83
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/28e74074-91be-47db-9db1-8be0cd9b2e83?wsid=/subscriptions/1b822ee6-a558-4a5d-8ad7-7af385162a03/resourcegroups/Ali_Student_Workspace/workspaces/Ali_workspace_student&tid=41f88ecb-ca63-404d-97dd-ab0a169fd138
PipelineRun Status: Running


StepRunId: fb15e589-1b05-4b50-bab3-86b96887b60e
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/fb15e589-1b05-4b50-bab3-86b96887b60e?wsid=/subscriptions/1b822ee6-a558-4a5d-8ad7-7af385162a03/resourcegroups/Ali_Student_Workspace/workspaces/Ali_workspace_student&tid=41f88ecb-ca63-404d-97dd-ab0a169fd138
StepRun( Prepare Data ) Status: Running

Streaming azureml-logs/20_image_build_log.txt
2022/07/21 22:47:28 Downloading source code...
2022/07/21 22:47:29 Finished downloading source code
2022/07/21 22:47:29 Creating Docker network: acb_default_network, driver: 'bridge'
2022/07/21 22:47:30 Successfully set up Docker network: acb_


qt-5.9.6             | 67.3 MB   |            |   0% 
qt-5.9.6             | 67.3 MB   | #2         |  12% 
qt-5.9.6             | 67.3 MB   | ###4       |  35% 
qt-5.9.6             | 67.3 MB   | #####6     |  57% 
qt-5.9.6             | 67.3 MB   | #######8   |  78% 
qt-5.9.6             | 67.3 MB   | ########## | 100% 
qt-5.9.6             | 67.3 MB   | ########## | 100% 

pcre-8.45            | 207 KB    |            |   0% 
pcre-8.45            | 207 KB    | ########## | 100% 

ipython_genutils-0.2 | 27 KB     |            |   0% 
ipython_genutils-0.2 | 27 KB     | ########## | 100% 

ipykernel-5.3.4      | 181 KB    |            |   0% 
ipykernel-5.3.4      | 181 KB    | ########## | 100% 

libffi-3.2.1         | 48 KB     |            |   0% 
libffi-3.2.1         | 48 KB     | ########## | 100% 

readline-7.0         | 848 KB    |            |   0% 
readline-7.0         | 848 KB    | ########## | 100% 

numpy-1.19.2         | 22 KB     |            |   0% 
numpy-1.19.2         


Removing intermediate container 5269bf3ea35e
 ---> 5840d43e5d78
Step 9/21 : ENV PATH /azureml-envs/azureml_0c5a9aa2def4b3c2501c1f40287a356b/bin:$PATH
 ---> Running in 1e9093d3bc7d
Removing intermediate container 1e9093d3bc7d
 ---> 4587b970c32f
Step 10/21 : COPY azureml-environment-setup/send_conda_dependencies.py azureml-environment-setup/send_conda_dependencies.py
 ---> b138950d4a0d
Step 11/21 : RUN echo "Copying environment context"
 ---> Running in 2bec19450541
Copying environment context
Removing intermediate container 2bec19450541
 ---> e81765fce72c
Step 12/21 : COPY azureml-environment-setup/environment_context.json azureml-environment-setup/environment_context.json
 ---> ec02ee03ce4f
Step 13/21 : RUN python /azureml-environment-setup/send_conda_dependencies.py -p /azureml-envs/azureml_0c5a9aa2def4b3c2501c1f40287a356b
 ---> Running in 015e5c9fcc0e
Report materialized dependencies for the environment
Reading environment context
Exporting conda environment
Sending request with mat


Streaming azureml-logs/55_azureml-execution-tvmps_77563777a360101fa750901206cec33feaf46d3b23b13c69b3445fa53da928a8_d.txt
2022-07-21T22:57:45Z Successfully mounted a/an Blobfuse File System at /mnt/batch/tasks/shared/LS_root/jobs/ali_workspace_student/azureml/fb15e589-1b05-4b50-bab3-86b96887b60e/mounts/workspaceblobstore -- stdout/stderr: 
2022-07-21T22:57:45Z The vmsize standard_d4as_v4 is not a GPU VM, skipping get GPU count by running nvidia-smi command.
2022-07-21T22:57:45Z Starting output-watcher...
2022-07-21T22:57:45Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
2022-07-21T22:57:46Z Executing 'Copy ACR Details file' on 10.0.0.4
2022-07-21T22:57:46Z Copy ACR Details file succeeded on 10.0.0.4. Output: 
>>>   
>>>   
70: Pulling from azureml/curated/sidecar
Digest: sha256:c5931fb3fb96642b435e66bd759c6595577396ede63973b46a987e6398dd8690
Status: Image is up to date for mcr.microsoft.com/azureml/curated/sidecar:70
mcr.microsoft.com/azureml/curated/sidecar:70
2022-07-


Streaming azureml-logs/75_job_post-tvmps_77563777a360101fa750901206cec33feaf46d3b23b13c69b3445fa53da928a8_d.txt
[2022-07-21T22:58:46.051744] Entering job release
[2022-07-21T22:58:46.734670] Starting job release
[2022-07-21T22:58:46.735041] Logging experiment finalizing status in history service.[2022-07-21T22:58:46.735369] job release stage : upload_datastore starting...
Starting the daemon thread to refresh tokens in background for process with pid = 265

[2022-07-21T22:58:46.735665] job release stage : start importing azureml.history._tracking in run_history_release.
[2022-07-21T22:58:46.735889] job release stage : execute_job_release starting...
[2022-07-21T22:58:46.736242] job release stage : copy_batchai_cached_logs starting...
[2022-07-21T22:58:46.737776] job release stage : copy_batchai_cached_logs completed...
[2022-07-21T22:58:46.746255] Entering context manager injector.
[2022-07-21T22:58:46.750369] job release stage : upload_datastore completed...
[2022-07-21T22:58:46.8179




StepRunId: 4db31ac8-1657-49c7-9e7b-a085cdbda284
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/4db31ac8-1657-49c7-9e7b-a085cdbda284?wsid=/subscriptions/1b822ee6-a558-4a5d-8ad7-7af385162a03/resourcegroups/Ali_Student_Workspace/workspaces/Ali_workspace_student&tid=41f88ecb-ca63-404d-97dd-ab0a169fd138
StepRun( Train and Register Model ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_77563777a360101fa750901206cec33feaf46d3b23b13c69b3445fa53da928a8_d.txt
2022-07-21T22:59:05Z Successfully mounted a/an Blobfuse File System at /mnt/batch/tasks/shared/LS_root/jobs/ali_workspace_student/azureml/4db31ac8-1657-49c7-9e7b-a085cdbda284/mounts/workspaceblobstore -- stdout/stderr: 
2022-07-21T22:59:06Z The vmsize standard_d4as_v4 is not a GPU VM, skipping get GPU count by running nvidia-smi command.
2022-07-21T22:59:06Z Starting output-watcher...
2022-07-21T22:59:06Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
2022-07-21T22:59:06Z Executing 'Co

2022-07-21T22:59:26Z job exited with code 0
2022-07-21T22:59:26Z Executing 'JobRelease task' on 10.0.0.4
2022-07-21T22:59:28Z The vmsize standard_d4as_v4 is not a GPU VM, skipping get GPU count by running nvidia-smi command.
2022-07-21T22:59:28Z The vmsize standard_d4as_v4 is not a GPU VM, skipping get GPU count by running nvidia-smi command.
2022-07-21T22:59:28Z JobRelease task succeeded on 10.0.0.4. Output: 
>>>   2022/07/21 22:59:26 Didn't get JobInfoJson from env, now read from file
>>>   2022/07/21 22:59:26 Suceeded read JobInfoJson from file
>>>   2022/07/21 22:59:26 Starting App Insight Logger for task:  jobRelease
>>>   2022/07/21 22:59:26 Version: 3.0.01999.0001 Branch: 20220623.1 Commit: 49a46b0
>>>   2022/07/21 22:59:26 Didn't get JobInfoJson from env, now read from file
>>>   2022/07/21 22:59:26 Suceeded read JobInfoJson from file
>>>   2022/07/21 22:59:26 SidecarEnabled:: isDetonationChamber: false, useDockerContainer: true
>>>   2022/07/21 22:59:26 SidecarEnabled:: AmlDat


Streaming azureml-logs/75_job_post-tvmps_77563777a360101fa750901206cec33feaf46d3b23b13c69b3445fa53da928a8_d.txt
[2022-07-21T22:59:26.293535] Entering job release
[2022-07-21T22:59:26.940684] Starting job release
[2022-07-21T22:59:26.941058] Logging experiment finalizing status in history service.
[2022-07-21T22:59:26.941209] job release stage : upload_datastore starting...Starting the daemon thread to refresh tokens in background for process with pid = 220

[2022-07-21T22:59:26.941863] job release stage : start importing azureml.history._tracking in run_history_release.
[2022-07-21T22:59:26.943327] job release stage : copy_batchai_cached_logs starting...[2022-07-21T22:59:26.945562] job release stage : execute_job_release starting...

[2022-07-21T22:59:26.950824] job release stage : copy_batchai_cached_logs completed...
[2022-07-21T22:59:26.951583] Entering context manager injector.
[2022-07-21T22:59:26.955342] job release stage : upload_datastore completed...
[2022-07-21T22:59:27.0282



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '28e74074-91be-47db-9db1-8be0cd9b2e83', 'status': 'Completed', 'startTimeUtc': '2022-07-21T22:47:22.537463Z', 'endTimeUtc': '2022-07-21T22:59:37.010338Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}', 'azureml.continue_on_step_failure': 'False', 'azureml.continue_on_failed_optional_input': 'True', 'azureml.pipelineComponent': 'pipelinerun'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://aliworkspacest6241059068.blob.core.windows.net/azureml/ExperimentRun/dcid.28e74074-91be-47db-9db1-8be0cd9b2e83/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b&sig=mOFbR6KT5vpPG3j5F0zQdpxmot8nXDG0cpWJampEfFg%3D&skoid=89b037e1-e81a-4bae-a716-a38886961045&sktid=41f88ecb-ca63-404d-97dd-ab0a169fd138&skt=2022-07-21T21%3A31%3A59Z&ske=2022-07-23T05%3A41%3A59Z&sks=b&skv=2019-07-07&st=2022-07-21T

'Finished'

In [12]:
# Print the metrics logged by each child (step) run of the pipeline
for step_run in pipeline_run.get_children():
    print(step_run.name, ':')
    metrics = step_run.get_metrics()
    for metric_name, metric_value in metrics.items():
        print('\t', metric_name, ":", metric_value)

Train and Register Model :
	 Accuracy : 0.8926666666666667
	 AUC : 0.8791133887258072
	 ROC : aml://artifactId/ExperimentRun/dcid.4db31ac8-1657-49c7-9e7b-a085cdbda284/ROC_1658444358.png
Prepare Data :
	 raw_rows : 10000
	 processed_rows : 10000


Assuming the pipeline was successful, a new model should be registered with a *Training context* tag indicating it was trained in a pipeline. Run the following code to verify this.

In [21]:
from azureml.core import Model

# Print every model returned for the workspace with its version, tags and properties
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        print('\t', tag_name, ':', model.tags[tag_name])
    for prop_name in model.properties:
        print('\t', prop_name, ':', model.properties[prop_name])
    print('\n')

diabetes_model version: 14
	 Training context : Pipeline
	 AUC : 0.8728767600378128
	 Accuracy : 0.8863333333333333


diabetes_model version: 13
	 Training context : Compute cluster
	 AUC : 0.8755958007861088
	 Accuracy : 0.888


diabetes_model version: 12
	 Training context : File dataset
	 AUC : 0.8483198169063138
	 Accuracy : 0.774


diabetes_model version: 11
	 Training context : Tabular dataset
	 AUC : 0.8484024080800039
	 Accuracy : 0.7736666666666666


diabetes_model version: 10
	 Training context : File dataset
	 AUC : 0.8468519356081545
	 Accuracy : 0.7788888888888889


diabetes_model version: 9
	 Training context : Tabular dataset
	 AUC : 0.8569287675378097
	 Accuracy : 0.79


diabetes_model version: 8
	 Training context : Parameterized script
	 AUC : 0.8483198169063138
	 Accuracy : 0.774


diabetes_model version: 7
	 Training context : Parameterized script
	 AUC : 0.8483198169063138
	 Accuracy : 0.774


diabetes_model version: 6
	 Training context : Script
	 AUC : 0.84838996

## Publish the pipeline

After you've created and tested a pipeline, you can publish it as a REST service.

In [22]:
# Publish the pipeline that produced this run
published_pipeline = pipeline_run.publish_pipeline(
    name="diabetes-training-pipeline",
    description="Trains diabetes model",
    version="1.0",
)

# Display the published pipeline
published_pipeline

Name,Id,Status,Endpoint
diabetes-training-pipeline,cac605a7-5e50-4f91-bca9-1b59a8756738,Active,REST Endpoint
