In [2]:
# Handle to the workspace
from azure.ai.ml import MLClient

# Authentication package
from azure.identity import DefaultAzureCredential

# Credential object handed to MLClient when the workspace handle is created.
credential = DefaultAzureCredential()

In [3]:
# Workspace coordinates -- fill these in before running (make sure they are correct)
subscr_id = ""
resc_name = ""
worksp_name = ""

# Gather the connection settings in one mapping, then build the workspace handle from it
workspace_settings = {
    "credential": credential,
    "subscription_id": subscr_id,
    "resource_group_name": resc_name,
    "workspace_name": worksp_name,
}
ml_client = MLClient(**workspace_settings)

# ws 
# from azureml.core import Workspace
# ws = Workspace(subscr_id, resc_name, worksp_name)

In [None]:
### First run az login in a command prompt to make sure you have authenticated using the Azure CLI

from azure.ai.ml.entities import AmlCompute

# Cluster name and VM size intended for a CPU cluster (same shape as the
# `target` / `size` arguments of create_compute).
# NOTE(review): the cpu_* values are not used in any visible cell -- confirm they are still needed.
cpu_compute_target = ""
cpu_size = ""

# Cluster name and VM size for the GPU cluster; these are the values passed to create_compute.
gpu_compute_target = ""
gpu_size = ""

def create_compute(target, size):
    """Make sure a compute cluster named ``target`` exists and return its name.

    The cluster is looked up through the notebook-level ``ml_client``. If the
    lookup fails, a new ``AmlCompute`` cluster is created with the given VM
    size and the scaling settings hard-coded below.

    Args:
        target: Name of the compute cluster. Must be a non-empty string.
        size: VM size; only used when a new cluster has to be created.

    Returns:
        ``target`` unchanged, so the result can be passed as ``compute=`` to a job.

    Raises:
        ValueError: If ``target`` is not a non-empty string.
    """
    # Reject unfilled placeholders up front instead of letting the lookup fail
    # and falling through to an attempt to create a cluster with an empty name.
    if not isinstance(target, str) or not target.strip():
        raise ValueError(
            "Compute target name must be a non-empty string; "
            "fill in the compute target placeholder first."
        )

    try:
        # let's see if the compute target already exists
        cluster = ml_client.compute.get(target)
    except Exception:
        # NOTE(review): every lookup failure (not only "cluster not found") ends
        # up here and triggers the creation path -- consider narrowing this to
        # the SDK's not-found exception once that dependency is imported.
        cluster = None

    if cluster is not None:
        print(
            f"You already have a cluster named {target}, we'll reuse it as is."
        )
        outcome = "reused"
    else:
        print("Creating a new compute target...")

        # Let's create the Azure ML compute object with the intended parameters
        cluster = AmlCompute(
            # Name assigned to the compute cluster
            name=target,
            # Azure ML Compute is the on-demand VM service
            type="amlcompute",
            # VM Family
            size=size,
            # Minimum running nodes when there is no job running
            min_instances=0,
            # Nodes in cluster
            max_instances=4,
            # Seconds a node keeps running after the job has terminated
            idle_time_before_scale_down=180,
            # Dedicated or LowPriority. The latter is cheaper but there is a chance of job termination
            tier="Dedicated",
        )

        # Now, we pass the object to MLClient's create_or_update method
        cluster = ml_client.begin_create_or_update(cluster).result()
        outcome = "created"

    # Report what actually happened: the summary no longer claims "created"
    # for a cluster that already existed.
    print(
        f"AMLCompute with name {cluster.name} is {outcome}, the compute size is {cluster.size}"
    )

    return target
    
# Ensure the GPU cluster exists; create_compute returns the cluster name,
# which the job and sweep cells pass on as `compute=`.
compute_target = create_compute(gpu_compute_target, gpu_size)

In [5]:
# For job
# Environment reference passed as `environment=` to command() in the job cell.
# NOTE(review): presumably a registered environment such as "<name>:<version>" -- confirm.
env_name = ""

In [6]:
from azure.ai.ml import MLClient, command, Input
from azure.ai.ml.constants import AssetTypes, InputOutputModes
from azure.identity import DefaultAzureCredential
import os

# Load from local
root_path = r""
data_set = ""  # Folder in root_path containing the images, masks and annotations to use (xm/ybg)

# Optional explicit path: when non-empty it takes precedence, otherwise the
# path is built from root_path and data_set. (An unconditional reassignment
# would silently discard the joined path.)
data_path_override = r""
data_path = data_path_override or os.path.join(root_path, data_set)

# Load from data asset 
# train_data_asset = ml_client.data.get("test_dataasset", version="1") # Split into train and vali in script
# data_path = train_data_asset.id

In [7]:
# from azure.ai.ml import command
# from azure.ai.ml import Input
# import os
# import json

# # Pass hyperparameter values
# param = {
#     "iterations": 30,
#     "learning_rate": 0.00025,
#     "img_per_batch": 3, # the real "batch size"
#     "roi_batch_size": 16, # the ROI head "batch size"
#     "dataset": data_set.replace("/", "-")
# }

# # Serialize the dictionary to a JSON file
# with open('param.json', 'w') as json_file:
#     json.dump(param, json_file)

# # Read the contents of param.json to set description of job
# with open('param.json', 'r') as json_file:
#     description = json.load(json_file)


In [None]:
from azure.ai.ml import command
from azure.ai.ml import Input

# Command line for the training script; the ${{inputs.*}} placeholders are
# named after the keys of `job_inputs` below.
train_command = (
    "python detectron2_train2.py"
    " --traindata ${{inputs.train_data}}"
    " --iterations ${{inputs.iterations}}"
    " --learning_rate ${{inputs.learning_rate}}"
    " --img_per_batch ${{inputs.img_per_batch}}"
    " --roi_batch_size ${{inputs.roi_batch_size}}"
)

# Training data folder
training_data = Input(
    path=data_path,
    type=AssetTypes.URI_FOLDER,
    mode=InputOutputModes.RO_MOUNT,  # Mount (.RO_MOUNT) or download (.DOWNLOAD)
)

# Data plus hyperparameter values handed to the script
job_inputs = {
    "train_data": training_data,
    "iterations": 100,
    "learning_rate": 0.00025,
    "img_per_batch": 3,  # the real "batch size"
    "roi_batch_size": 16,  # the ROI head "batch size"
}

# Define the job ...
job = command(
    code=".",
    command=train_command,
    inputs=job_inputs,
    compute=compute_target,
    environment=env_name,
    outputs={},  # Add any required outputs here
    description="Detectron2 training. See job yaml for parameters.",
)

# ... and submit it (kept as the last expression so the cell displays the returned job)
ml_client.jobs.create_or_update(job)

In [10]:
from azure.ai.ml.sweep import Choice, Uniform, MedianStoppingPolicy

# Search space for the sweep; the keys match the input names declared on `job`.
sweep_space = {
    "iterations": Choice(values=[30, 100]),
    "learning_rate": Uniform(min_value=0.0001, max_value=0.0002),
    "img_per_batch": Choice(values=[3]),
    "roi_batch_size": Choice(values=[16, 32]),
}

# Calling the job with these values yields the parameterised job used for sweeping
job_for_sweep = job(**sweep_space)

In [None]:
# Sweep settings: random sampling on the same compute target, minimising "vali_loss"
sweep_settings = {
    "compute": compute_target,
    "sampling_algorithm": "random",
    "primary_metric": "vali_loss",
    "goal": "Minimize",
}

# Apply the settings to the parameterised job to obtain the sweep job
sweep_job = job_for_sweep.sweep(**sweep_settings)

# Limits for this sweep: total trials, concurrent trials and overall timeout
sweep_limits = {
    "max_total_trials": 20,
    "max_concurrent_trials": 10,
    "timeout": 7200,
}
sweep_job.set_limits(**sweep_limits)

# Submit the sweep (last expression, so the cell displays the returned job)
ml_client.create_or_update(sweep_job)
