# Import required libraries

In [1]:
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import command
from azure.ai.ml import Input
from azure.ai.ml import load_component
from azure.ai.ml import MLClient
from azure.ai.ml.entities import AmlCompute
from azure.ai.ml.entities import Environment

import os


# Configure Credentials

In [2]:
# Obtain an Azure credential: try DefaultAzureCredential first and fall back
# to an interactive browser login only when it cannot issue a token.
try:
    credential = DefaultAzureCredential()
    # Requesting a management token up front verifies the credential works.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Deliberate best-effort fallback, but report the cause instead of
    # silently discarding it so authentication problems can be diagnosed.
    print(
        f"DefaultAzureCredential failed ({ex}); falling back to InteractiveBrowserCredential."
    )
    credential = InteractiveBrowserCredential()


# Configure workspace details and get a handle to the workspace

In [3]:
# Build the workspace client: prefer a local config file and fall back to
# the explicit workspace details below when the config cannot be used.
try:
    ml_client = MLClient.from_config(credential=credential)
except Exception as ex:
    # Report why the config-based path failed so that connecting through the
    # hard-coded details below never happens unnoticed.
    print(
        f"Could not create MLClient from a config file ({ex}); using explicit workspace details."
    )

    # enter details of your AML workspace
    subscription_id = "c4a0cc8e-60ab-48fe-aee1-eb8752ad671a"
    resource_group = "ans-poc-gg-dp-dev"
    workspace = "gg-workspace"

    # get a handle to the workspace (keywords make the argument roles explicit)
    ml_client = MLClient(
        credential=credential,
        subscription_id=subscription_id,
        resource_group_name=resource_group,
        workspace_name=workspace,
    )

print(ml_client)


Found the config file in: ./config.json
Class WorkspaceHubOperations: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


MLClient(credential=<azure.identity._credentials.default.DefaultAzureCredential object at 0x1046407c0>,
         subscription_id=c4a0cc8e-60ab-48fe-aee1-eb8752ad671a,
         resource_group_name=ans-poc-gg-dp-dev,
         workspace_name=gg-workspace)


# Environment

In [4]:
# Folder that receives the conda specification written by the next cell.
dependencies_dir = "./dependencies"
# Safe to re-run: an already existing folder is not treated as an error.
os.makedirs(name=dependencies_dir, exist_ok=True)


## Create yaml file

In [12]:
%%writefile {dependencies_dir}/conda.yaml
# Conda specification for the training environment registered in the next cell.
name: model-env
channels:
  - conda-forge
dependencies:
  # Packages resolved by conda.
  - python=3.8
  - numpy=1.21.2
  - pip=21.2.4
  - scikit-learn=0.24.2
  - scipy=1.7.1
  - pandas>=1.1,<1.2
  # Packages installed with pip inside the conda environment.
  - pip:
    - inference-schema[numpy-support]==1.3.0
    - mlflow==2.6.0
    - azureml-mlflow==1.53.0
    - psutil>=5.8,<5.9
    - tqdm>=4.59,<4.60
    - ipykernel~=6.0
    # NOTE(review): matplotlib is the only unpinned dependency - consider pinning it.
    - matplotlib

Overwriting ./dependencies/conda.yaml


## Register Environment

In [13]:
# Register (or update) the custom training environment in the workspace.
custom_env_name = "aml-scikit-learn"

# Environment definition: the base image plus the conda file written above.
custom_job_env = Environment(
    name=custom_env_name,
    # The description previously mentioned an unrelated "Credit Card Defaults"
    # job; this notebook trains the diabetes regression model.
    description="Custom environment for the diabetes model training job",
    tags={"scikit-learn": "0.24.2"},
    conda_file=os.path.join(dependencies_dir, "conda.yaml"),
    image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest",
)
# From here on the name refers to the environment object returned by the workspace.
custom_job_env = ml_client.environments.create_or_update(custom_job_env)

print(
    f"Environment with name {custom_job_env.name} is registered to workspace, the environment version is {custom_job_env.version}"
)


Environment with name aml-scikit-learn is registered to workspace, the environment version is 4


# Compute

In [14]:
# Name assigned to the compute cluster
cpu_compute_target = "cpu-cluster"

try:
    # Reuse the compute target when it already exists in the workspace.
    cpu_cluster = ml_client.compute.get(cpu_compute_target)
    print(
        f"You already have a cluster named {cpu_compute_target}, we'll reuse it as is."
    )

except Exception:
    print("Creating a new cpu compute target...")

    # Create the Azure Machine Learning compute object with the intended parameters.
    # If you run into an out-of-quota error, change the size to a comparable VM that is available.
    # Learn more on https://azure.microsoft.com/en-us/pricing/details/machine-learning/.
    cpu_cluster = AmlCompute(
        name=cpu_compute_target,
        # Azure Machine Learning Compute is the on-demand VM service
        type="amlcompute",
        # VM Family
        size="STANDARD_DS3_V2",
        # Minimum number of running nodes when no job is running
        min_instances=0,
        # Maximum number of nodes in the cluster
        max_instances=4,
        # How many seconds a node keeps running after its job terminates
        idle_time_before_scale_down=180,
        # Dedicated or LowPriority. The latter is cheaper but there is a chance of job termination
        tier="Dedicated",
    )
    print(
        f"AMLCompute with name {cpu_cluster.name} will be created, with compute size {cpu_cluster.size}"
    )
    # begin_create_or_update starts a long-running operation and returns a poller;
    # .result() waits for provisioning to finish and yields the compute entity, so
    # cpu_cluster holds the same kind of object on both branches.
    cpu_cluster = ml_client.compute.begin_create_or_update(cpu_cluster).result()

You already have a cluster named cpu-cluster, we'll reuse it as is.


# Training Script

In [15]:
# Folder that receives the training script written by the next cell.
train_src_dir = "./src"
# Safe to re-run: an already existing folder is not treated as an error.
os.makedirs(name=train_src_dir, exist_ok=True)


In [24]:
%%writefile {train_src_dir}/main.py

# imports
import os
import mlflow
import argparse

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# define functions
def main(args):
    """Run the training job: load the CSV, split it, and train the model.

    Args:
        args: Parsed command-line namespace; reads ``data``, ``random_state``,
            ``fit_intercept``, ``normalize`` and ``positive``.
    """
    # Turn on MLflow autologging before any training happens.
    mlflow.autolog()

    # Collect the estimator options from the command line.
    hyperparams = {
        option: getattr(args, option)
        for option in ("fit_intercept", "normalize", "positive")
    }

    # Load the dataset and derive the train/test partitions.
    frame = pd.read_csv(args.data)
    X_train, X_test, y_train, y_test = process_data(frame, args.random_state)

    # Fit the model; the returned estimator is not needed afterwards.
    # (The run is not ended explicitly here.)
    train_model(hyperparams, X_train, X_test, y_train, y_test)

def process_data(df, random_state):
    """Separate the ``target`` column and split the data 80/20.

    Args:
        df: DataFrame containing a ``target`` column plus feature columns.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # Target is the "target" column; every other column is a feature.
    labels = df["target"]
    features = df.drop(columns=["target"])

    # Hold out 20% of the rows for testing.
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.2, random_state=random_state
    )

    return features_train, features_test, labels_train, labels_test

def train_model(params, X_train, X_test, y_train, y_test, registered_model_name=None):
    """Fit a LinearRegression model, then log/register and save it via MLflow.

    Args:
        params: Keyword arguments forwarded to ``LinearRegression``.
        X_train: Training features passed to ``fit``.
        X_test: Held-out features; accepted for interface compatibility, unused here.
        y_train: Training targets passed to ``fit``.
        y_test: Held-out targets; accepted for interface compatibility, unused here.
        registered_model_name: Name used both to register the model and as the
            artifact/save path. When omitted, the value is taken from the
            module-level ``args`` namespace created when the script is run
            directly (the previous, implicit behaviour).

    Returns:
        The fitted model.

    Raises:
        ValueError: If no name is passed and no module-level ``args`` exists.
    """
    # Resolve the name before training so a missing name fails fast instead of
    # surfacing as a NameError after the model has already been fitted.
    if registered_model_name is None:
        cli_args = globals().get("args")
        if cli_args is None:
            raise ValueError(
                "registered_model_name was not provided and no module-level "
                "'args' namespace is available"
            )
        registered_model_name = cli_args.registered_model_name

    # train model
    model = LinearRegression(**params)
    model = model.fit(X_train, y_train)

    # Registering the model to the workspace
    print("Registering the model via MLFlow")
    mlflow.sklearn.log_model(
        sk_model=model,
        registered_model_name=registered_model_name,
        artifact_path=registered_model_name,
    )

    # Saving the model to a file
    mlflow.sklearn.save_model(
        sk_model=model,
        path=os.path.join(registered_model_name, "trained_model"),
    )

    # return model
    return model

def _str_to_bool(value):
    """Convert a command-line string such as "true" or "0" to a bool."""
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def parse_args():
    """Parse the training script's command-line arguments.

    Boolean options use an explicit string-to-bool converter: with
    ``type=bool`` any non-empty string (including "False") parses as True.

    Returns:
        argparse.Namespace with ``data``, ``random_state``, ``fit_intercept``,
        ``normalize``, ``positive`` and ``registered_model_name``.
    """
    # setup arg parser
    parser = argparse.ArgumentParser()

    # add arguments
    parser.add_argument("--data", type=str)
    parser.add_argument("--random_state", type=int, default=42)
    parser.add_argument("--fit_intercept", type=_str_to_bool, default=True)
    parser.add_argument("--normalize", type=_str_to_bool, default=False)
    parser.add_argument("--positive", type=_str_to_bool, default=False)
    parser.add_argument("--registered_model_name", type=str, help="model name")

    # parse args
    args = parser.parse_args()

    # return args
    return args

# run script
# Entry point when the file is executed directly (not when it is imported).
if __name__ == "__main__":
    # parse args
    # NOTE: `args` is bound at module level here; train_model reads
    # `args.registered_model_name`, so this name must not be renamed or inlined.
    args = parse_args()

    # run main function
    main(args)


Overwriting ./src/main.py


# Configure the Command

In [25]:
# Name under which the trained model is registered; also used as the job's display name.
registered_model_name = "diabetes_model"

# Describe the training run: source folder, command line, inputs, environment and compute.
job = command(
    code="./src/",  # location of source code
    command="python main.py --data ${{inputs.data}} --registered_model_name ${{inputs.registered_model_name}}",
    inputs={
        "data": Input(
            type="uri_file",
            path="https://azuremlexamples.blob.core.windows.net/datasets/diabetes.csv",
        ),
        "registered_model_name": registered_model_name,
    },
    environment="aml-scikit-learn@latest",
    # Passing None for compute means serverless is used instead of the cluster.
    compute=(cpu_compute_target if cpu_cluster else None),
    experiment_name="Diabetes",
    display_name=registered_model_name,
)


# Submit the job

In [26]:
# Submit the configured command job; the returned job object is this cell's
# displayed output (experiment, name, type, status and a studio link).
ml_client.create_or_update(job)


[32mUploading src (0.0 MBs): 100%|██████████| 2390/2390 [00:00<00:00, 65109.91it/s]
[39m



Experiment,Name,Type,Status,Details Page
Diabetes,sharp_map_zzm4rfv127,command,Starting,Link to Azure Machine Learning studio
