In [1]:
from azure.ai.ml import load_component
from azure.ai.ml.dsl import pipeline
from azure.ai.ml import MLClient, Input
from azure.identity import DefaultAzureCredential, EnvironmentCredential
from azure.ai.ml.entities import AmlCompute
import os

In [2]:
# Get a handle to the Azure ML workspace.
# The credential is built with no arguments and handed to MLClient.from_config;
# the captured cell output shows the workspace settings were read from a
# config.json file.
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential=credential)

Found the config file in: /config.json


In [3]:
# Create (or reuse) the custom environment used by this project.
# The import sits at the top of the cell rather than inside the except handler
# so the cell's dependencies are visible at a glance.
from azure.ai.ml.entities import Environment

environment_path = "."
environment_name = "project-Environment"
environment_version = "1"  # single source of truth for lookup and creation

try:
    # Reuse the registered environment when this name/version already exists.
    environment = ml_client.environments.get(
        name=environment_name, version=environment_version
    )
except Exception:
    # NOTE(review): any failure of the lookup (not only "not found") lands here
    # and triggers a creation attempt; narrow the exception type if the SDK's
    # not-found error class is available in this kernel.
    print(f"Cannot find environment {environment_name} in workspace, creating it")
    environment = ml_client.environments.create_or_update(
        Environment(
            image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20220303.v1",
            conda_file=f"{environment_path}/environment/conda_envs/python-project-environment.yaml",
            name=environment_name,
            version=environment_version,
            description="Custom environment for eda",
        )
    )

Cannot find environment project-Environment in workspace, creating it


In [6]:
# Define a compute target
def get_comput_target(ml_client, name="cpu-cluster", family='Standard_D2_v2'):
    """Return the compute cluster called ``name``, creating it if the lookup fails.

    Args:
        ml_client: Client exposing ``compute.get`` and
            ``compute.begin_create_or_update`` (an ``MLClient`` in this notebook).
        name: Name of the compute target to look up or create.
        family: VM size passed as ``size`` when a new cluster is created.

    Returns:
        The object returned by ``ml_client.compute.get(name)`` when the lookup
        succeeds, otherwise the result of the create-or-update operation.
    """
    try:
        # Reuse the compute target if it already exists.
        return ml_client.compute.get(name)
    except Exception:
        # NOTE(review): every lookup failure is treated as "does not exist";
        # narrow the exception type if the SDK's not-found error is available.
        cluster_spec = AmlCompute(
            name=name,
            type="amlcompute",
            size=family,
            min_instances=0,
            max_instances=2,
            idle_time_before_scale_down=180,
            tier="Dedicated",
        )
        # .result() blocks until the create/update operation has finished.
        return ml_client.compute.begin_create_or_update(cluster_spec).result()

In [7]:
# Make sure the default "cpu-cluster" compute exists (it is looked up and, if
# the lookup fails, created) before the pipeline that targets it is submitted.
compute_target = get_comput_target(ml_client)

In [8]:
# Components: load one component definition per pipeline step from its YAML
# spec. Paths are relative to the notebook's working directory.
fill_eda_component = load_component(source="./fill_eda-component/fill_eda.yml")
split_component = load_component(source="./split-component/split.yml")
# Two alternative trainers (logistic regression and decision tree).
train_lr_component = load_component(source="./train_LogisticRegression_component/train_LogisticRegression.yml")
train_dt_component = load_component(source="./train_DecisionTree_component/train_DecisionTree.yml")
# Scoring and evaluation components are each used twice in the pipeline, once per model.
score_component = load_component(source="./score-component/score.yml")
eval_component = load_component(source="./eval-component/eval.yml")

In [9]:
# define a pipeline: Fill, split, train LR, train DT, score, eval
# Data flow: fill -> split -> {train LR, train DT} -> score (per model) -> eval (per model).
# NOTE(review): the azure.ai.ml DSL inspects this function; node variable names
# and a docstring may end up in the job metadata -- confirm before renaming
# locals or adding a docstring.
@pipeline(
    default_compute='cpu-cluster',
)
def water_potability_prediction(pipeline_input_data):     
        
    # Step 1: run the fill/EDA component on the raw input data.
    fill_node=fill_eda_component(
        data_set = pipeline_input_data,
    )    

    # Step 2: split the cleaned data; downstream steps read data_train / data_test.
    split_node=split_component(
        data_set = fill_node.outputs.data_clean,
    )
    
    # Step 3a: train the logistic-regression model on the training split.
    train_lr_node=train_lr_component(
        data_train = split_node.outputs.data_train,
    )

    # Step 3b: train the decision-tree model on the same training split,
    # with explicitly chosen hyperparameters.
    train_dt_node=train_dt_component(
        data_train = split_node.outputs.data_train,
        criterion = "entropy", 
        min_samples_split= 3, 
        max_depth=4,
    )        

    # Step 4: score each trained model against the shared test split.
    score_lr_node=score_component(
        model_input=train_lr_node.outputs.model_output_lr_pickle,
        test_data=split_node.outputs.data_test,
    )

    score_dt_node=score_component(
        model_input=train_dt_node.outputs.model_output_dt_pickle,
        test_data=split_node.outputs.data_test,
    )

    # Step 5: evaluate each model's scoring result.
    eval_lr_node=eval_component(
        scoring_result=score_lr_node.outputs.score_output
    )

    eval_dt_node = eval_component(
        scoring_result=score_dt_node.outputs.score_output
    )

    # Pipeline-level outputs: the pair plot, both model pickles and both evaluations.
    return {
        "pipeline_fill_pairplot_output" : fill_node.outputs.pairplot_fig,  
        "pipeline_model_pkl_lr_output" : train_lr_node.outputs.model_output_lr_pickle,     
        "pipeline_model_pkl_dt_output" : train_dt_node.outputs.model_output_dt_pickle,
        "pipeline_eval_lr_output": eval_lr_node.outputs.eval_output,
        "pipeline_eval_dt_output": eval_dt_node.outputs.eval_output
    }

In [10]:
# Create the pipeline job: wrap the CSV as a single-file ("uri_file") Input and
# bind it to the pipeline function's only parameter.
# NOTE(review): the path hard-codes a specific subscription, resource group and
# workspace, so this cell only works against that workspace -- consider moving
# the URI into a configuration cell.
water_potability_ds =  Input(
            type="uri_file",
            path="azureml://subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourcegroups/diplomado_ucb/workspaces/ws-proyecto-2-wp/datastores/workspaceblobstore/paths/UI/2023-11-08_121450_UTC/water_potability_ds.csv",
        )
pipeline_job = water_potability_prediction(pipeline_input_data=water_potability_ds)

In [11]:
# Submit the pipeline job under the "pipeline_water_potability_p2" experiment.
# The name pipeline_job is rebound to the object returned by create_or_update
# (later cells read its .name), so re-running this cell passes that returned
# object back in rather than the original definition.
pipeline_job = ml_client.jobs.create_or_update(
    pipeline_job, experiment_name="pipeline_water_potability_p2"
)
# Bare last expression: display the submitted job as the cell output.
pipeline_job

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
[32mUploading fill_eda_src (0.0 MBs):

Experiment,Name,Type,Status,Details Page
pipeline_water_potability_p2,magenta_celery_rsgvqtr2ck,pipeline,Preparing,Link to Azure Machine Learning studio


In [12]:
# Wait until the job completes: stream the submitted job's progress, identified
# by its name, into the cell output.
ml_client.jobs.stream(pipeline_job.name)

RunId: magenta_celery_rsgvqtr2ck
Web View: https://ml.azure.com/runs/magenta_celery_rsgvqtr2ck?wsid=/subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourcegroups/diplomado_ucb/workspaces/ws-proyecto-2-wp

Execution Summary
RunId: magenta_celery_rsgvqtr2ck
Web View: https://ml.azure.com/runs/magenta_celery_rsgvqtr2ck?wsid=/subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourcegroups/diplomado_ucb/workspaces/ws-proyecto-2-wp



In [13]:
# Download all the outputs of the job (all=True) into ./pipeline_output; the
# captured log shows named outputs landing under pipeline_output/named-outputs/.
# NOTE(review): `output` is not used in the visible cells -- confirm it is needed.
output = ml_client.jobs.download(name=pipeline_job.name, download_path='./pipeline_output', all=True)

Downloading artifact azureml://subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourcegroups/diplomado_ucb/workspaces/ws-proyecto-2-wp/datastores/workspaceblobstore/paths/azureml/050f908a-7e6b-4965-b02e-fa9f798f2b95/pairplot_fig/ to pipeline_output/named-outputs/pipeline_fill_pairplot_output
Downloading artifact azureml://subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourcegroups/diplomado_ucb/workspaces/ws-proyecto-2-wp/datastores/workspaceblobstore/paths/azureml/b32f90fa-1317-4f22-8ffa-7334b31a65f3/model_output_lr_pickle/ to pipeline_output/named-outputs/pipeline_model_pkl_lr_output
Downloading artifact azureml://subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourcegroups/diplomado_ucb/workspaces/ws-proyecto-2-wp/datastores/workspaceblobstore/paths/azureml/e914f517-b280-4be0-a0fd-d4291fcb8f2f/model_output_dt_pickle/ to pipeline_output/named-outputs/pipeline_model_pkl_dt_output
Downloading artifact azureml://subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourceg