In [1]:
# Handle to the workspace
from azure.ai.ml import MLClient

# Authentication package
from azure.identity import DefaultAzureCredential

# Credential object handed to MLClient in the next cell.
# NOTE(review): a later cell asks the user to run `az login` first, presumably
# so that this credential can pick up the Azure CLI session -- confirm.
credential = DefaultAzureCredential()

In [2]:
# Workspace coordinates -- fill all three in before running this cell.
subscr_id = ""
resc_name = ""
worksp_name = ""

# Bind an MLClient to that workspace using the credential created earlier.
ml_client = MLClient(
    workspace_name=worksp_name,
    resource_group_name=resc_name,
    subscription_id=subscr_id,
    credential=credential,
)

# Alternative kept for reference: workspace handle via the azureml.core API.
# from azureml.core import Workspace
# ws = Workspace(subscr_id, resc_name, worksp_name)

In [None]:
### First run az login in a command prompt to make sure you have authenticated using the Azure CLI

from azure.ai.ml.entities import AmlCompute

# Name and VM size for a CPU cluster (placeholders -- fill in before use).
# NOTE(review): this pair is not referenced by any of the visible cells; only
# the GPU pair is passed to create_compute() below.
cpu_compute_target = ""
cpu_size = ""

# Name and VM size for the GPU cluster; both are passed to create_compute() below.
gpu_compute_target = ""
gpu_size = ""

def create_compute(target, size):
    """Return the name of an Azure ML compute cluster, creating it if needed.

    The cluster is looked up through the notebook-level ``ml_client``. When the
    lookup succeeds the existing cluster is reused unchanged and ``size`` is
    ignored. When the lookup raises, a new ``AmlCompute`` cluster is submitted
    and the call blocks on ``.result()`` until the operation completes.

    Args:
        target: Name of the compute cluster to reuse or create.
        size: VM size used only when a new cluster has to be created.

    Returns:
        ``target`` unchanged, so it can be passed on as a job's ``compute``.

    Raises:
        ValueError: If ``target`` is empty, or if ``size`` is empty and a new
            cluster would have to be created.
    """
    # The notebook ships with empty-string placeholders; fail early with a
    # clear message instead of a confusing error from inside the SDK.
    if not target:
        raise ValueError(
            "create_compute: 'target' (compute cluster name) must not be empty."
        )

    try:
        # Let's see if the compute target already exists.
        cluster = ml_client.compute.get(target)
    except Exception as lookup_error:
        # NOTE(review): every lookup failure is treated as "cluster missing",
        # as in the original code. Presumably only a not-found error should
        # lead to creation (auth/network errors should not) -- consider
        # narrowing this to the SDK's not-found exception once confirmed.
        if not size:
            raise ValueError(
                f"create_compute: 'size' must not be empty when creating {target}."
            ) from lookup_error

        # Surface the reason instead of swallowing it silently.
        print(
            f"Could not get compute target {target} "
            f"({type(lookup_error).__name__}: {lookup_error})."
        )
        print("Creating a new compute target...")

        # Describe the cluster with the intended parameters.
        cluster = AmlCompute(
            # Name assigned to the compute cluster
            name=target,
            # Azure ML Compute is the on-demand VM service
            type="amlcompute",
            # VM family / size
            size=size,
            # Minimum number of nodes kept running when no job is running
            min_instances=0,
            # Maximum number of nodes in the cluster
            max_instances=4,
            # Seconds a node stays up after its job ends before scaling down
            idle_time_before_scale_down=180,
            # Dedicated or LowPriority. The latter is cheaper but jobs may be terminated
            tier="Dedicated",
        )

        # Submit the definition and wait for the operation to finish.
        cluster = ml_client.begin_create_or_update(cluster).result()
        print(
            f"AMLCompute with name {cluster.name} is created, the compute size is {cluster.size}"
        )
    else:
        print(
            f"You already have a cluster named {target}, we'll reuse it as is."
        )
        # Report the existing cluster without claiming it was just created.
        print(
            f"AMLCompute with name {cluster.name} is reused, the compute size is {cluster.size}"
        )

    return target
    
# Reuse or create the GPU cluster; `compute_target` holds the cluster name that
# the job and sweep cells below pass as `compute=`.
# To target the CPU cluster instead, pass cpu_compute_target / cpu_size.
compute_target = create_compute(gpu_compute_target, gpu_size)

In [7]:
# Environment the training job runs in; passed as `environment=` to command() below.
# NOTE(review): presumably a registered environment reference such as
# "<name>:<version>" or "<name>@latest" -- confirm the expected format.
env_name = ""

# Alternative kept for reference: build the environment via the azureml.core API.
# from azureml.core import Environment
# mycustomenv = Environment(name="")

In [8]:
from azure.ai.ml import MLClient, command, Input
from azure.ai.ml import command, Input, MLClient, UserIdentityConfiguration, ManagedIdentityConfiguration
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes, InputOutputModes
from azure.identity import DefaultAzureCredential
import os

# Load from path: location of the training data, used as the job's `train_data` input.
# NOTE(review): "wasbs:" looks like a placeholder for a full wasbs:// URI -- fill in before running.
data_path = r"wasbs:"

# Alternative: load from a registered data asset instead of a raw path.
# train_data_asset = ml_client.data.get("test_dataasset", version="1")
# data_path = train_data_asset.id

# Identity the job runs under; may be needed when the data lives in an Azure container.
identity = UserIdentityConfiguration() # Use the user's identity
# identity = ManagedIdentityConfiguration() # Use the compute target managed identity

In [None]:
from azure.ai.ml import command
from azure.ai.ml import Input
import os
import json

# Define the training command job, then submit it to the workspace.
job = command(
    experiment_name="yolov8-mc",  # names of deleted experiments cannot be reused
    code=".",
    # Adjacent literals concatenate into the single command line; the
    # ${{inputs.*}} placeholders refer to the keys of `inputs` below.
    command=(
        "python yolov8_train.py"
        " --traindata ${{inputs.train_data}}"
        " --epochs ${{inputs.epochs}}"
        " --batch ${{inputs.batch}}"
        " --lr0 ${{inputs.lr0}}"
        " --imgsz ${{inputs.imgsz}}"
    ),
    inputs=dict(
        train_data=Input(
            type=AssetTypes.URI_FOLDER,
            path=data_path,
            # Read-only mount (author's open question: can RW_MOUNT not be used here?)
            mode=InputOutputModes.RO_MOUNT,
        ),
        epochs=1,
        batch=5,
        lr0=0.0033787,  # 0.0033787 was the best value found for detectron2
        imgsz=538,
    ),
    outputs={},  # add any required outputs here
    environment=env_name,
    compute=compute_target,
    identity=identity,
)

ml_client.jobs.create_or_update(job)

In [8]:
from azure.ai.ml.sweep import Choice, Uniform, MedianStoppingPolicy

# Call the command job with search-space expressions in place of its fixed
# scalar inputs. Only `epochs` has more than one candidate value here; the
# other three inputs are pinned to a single choice each. `train_data` is not
# overridden in this call.
job_for_sweep = job(
    epochs=Choice(values=[1, 10]),
    batch=Choice(values=[5]),
    lr0=Choice(values=[0.0033787]),
    imgsz=Choice(values=[538])
)

In [None]:
# Turn the parameterised job into a sweep that minimises the logged metric.
sweep_job = job_for_sweep.sweep(
    sampling_algorithm="random",
    goal="Minimize",
    primary_metric="total val loss",
    compute=compute_target,
)

# Cap the sweep: trials run one at a time, at most 20 in total.
sweep_job.set_limits(max_concurrent_trials=1, max_total_trials=20)

# Optional early-termination policy, left disabled.
# NOTE(review): `num_epochs` is not defined in the visible cells; define it before enabling this.
# sweep_job.early_termination = MedianStoppingPolicy(delay_evaluation = int(num_epochs/4), evaluation_interval = int(num_epochs/4))

# Submit the sweep to the workspace.
ml_client.create_or_update(sweep_job)